1 /*
2  *  splitgroupscommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 9/20/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9 
10 #include "splitgroupscommand.h"
11 #include "sequenceparser.h"
12 #include "counttable.h"
13 
14 //**********************************************************************************************************************
setParameters()15 vector<string> SplitGroupCommand::setParameters(){
16 	try {
17         CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none","list",false,false,true); parameters.push_back(plist);
18         CommandParameter pflow("flow", "InputTypes", "", "", "none", "none", "none","fasta",false,false,true); parameters.push_back(pflow);
19 		CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","fasta",false,false,true); parameters.push_back(pfasta);
20         CommandParameter pfastq("fastq", "InputTypes", "", "", "none", "none", "none","fastq",false,false,true); parameters.push_back(pfastq);
21         CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none","name",false,false,true); parameters.push_back(pname);
22         CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "CountGroup", "none","count",false,false,true); parameters.push_back(pcount);
23 		CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "CountGroup", "none","group",false,false,true); parameters.push_back(pgroup);
24 		CommandParameter pgroups("groups", "String", "", "", "", "", "","",false,false); parameters.push_back(pgroups);
25         CommandParameter pformat("format", "Multiple", "sanger-illumina-solexa-illumina1.8+", "sanger", "", "", "","",false,false,true); parameters.push_back(pformat);
26         CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
27 		CommandParameter pseed("seed", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pseed);
28         CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
29 		CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
30 
31         vector<string> tempOutNames;
32         outputTypes["fasta"] = tempOutNames;
33         outputTypes["flow"] = tempOutNames;
34         outputTypes["fastq"] = tempOutNames;
35         outputTypes["name"] = tempOutNames;
36         outputTypes["count"] = tempOutNames;
37         outputTypes["group"] = tempOutNames;
38         outputTypes["list"] = tempOutNames;
39 
40         abort = false; calledHelp = false;
41 
42 		vector<string> myArray;
43 		for (int i = 0; i < parameters.size(); i++) {	myArray.push_back(parameters[i].name);		}
44 		return myArray;
45 	}
46 	catch(exception& e) {
47 		m->errorOut(e, "SplitGroupCommand", "setParameters");
48 		exit(1);
49 	}
50 }
51 //**********************************************************************************************************************
getHelpString()52 string SplitGroupCommand::getHelpString(){
53 	try {
54 		string helpString = "";
55 		helpString += "The split.groups command reads a group or count file, and parses your files by groups. \n";
56 		helpString += "The split.groups command parameters are fasta, fastq, flow, name, group, count, groups and processors.\n";
57 		helpString += "The group or count parameter is required.\n";
58 		helpString += "The groups parameter allows you to select groups to create files for.  \n";
59         helpString += "The format parameter is used with the fastq parameter to indicate whether your sequences are sanger, solexa, illumina1.8+ or illumina, default=illumina1.8+.\n";
60 		helpString += "For example if you set groups=A-B-C, you will get a .A.fasta, .A.names, .B.fasta, .B.names, .C.fasta, .C.names files.  \n";
61 		helpString += "If you want .fasta and .names files for all groups, set groups=all.  \n";
62 		helpString += "The split.groups command should be used in the following format: split.group(fasta=yourFasta, group=yourGroupFile).\n";
63 		helpString += "Example: split.groups(fasta=abrecovery.fasta, group=abrecovery.groups).\n";
64 		;
65 		return helpString;
66 	}
67 	catch(exception& e) {
68 		m->errorOut(e, "SplitGroupCommand", "getHelpString");
69 		exit(1);
70 	}
71 }
72 //**********************************************************************************************************************
getOutputPattern(string type)73 string SplitGroupCommand::getOutputPattern(string type) {
74     try {
75         string pattern = "";
76 
77         if (type == "fasta")        {  pattern = "[filename],[group],fasta";        }
78         else if (type == "list")    {  pattern = "[filename],[group],list";         }
79         else if (type == "fastq")    {  pattern = "[filename],[group],fastq";       }
80         else if (type == "flow")    {  pattern = "[filename],[group],flow";         }
81         else if (type == "name")    {  pattern = "[filename],[group],names";        }
82         else if (type == "count")   {  pattern = "[filename],[group],count_table";  }
83         else if (type == "group")   {  pattern = "[filename],[group],groups";  }
84         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->setControl_pressed(true);  }
85 
86         return pattern;
87     }
88     catch(exception& e) {
89         m->errorOut(e, "SplitGroupCommand", "getOutputPattern");
90         exit(1);
91     }
92 }
93 
94 //**********************************************************************************************************************
SplitGroupCommand(string option)95 SplitGroupCommand::SplitGroupCommand(string option) : Command()  {
96 	try {
97 		if(option == "help") { help(); abort = true; calledHelp = true; }
98 		else if(option == "citation") { citation(); abort = true; calledHelp = true;}
99         else if(option == "category") {  abort = true; calledHelp = true;  }
100 
101 		else {
102 			OptionParser parser(option, setParameters());
103 			map<string, string> parameters = parser.getParameters();
104 
105 			ValidParameters validParameter;
106 			namefile = validParameter.validFile(parameters, "name");
107 			if (namefile == "not open") { namefile = ""; abort = true; }
108 			else if (namefile == "not found") { namefile = ""; }
109 			else { current->setNameFile(namefile); }
110 
111 			fastafile = validParameter.validFile(parameters, "fasta");
112 			if (fastafile == "not open") { abort = true; }
113             else if (fastafile == "not found") {	fastafile = "";  }
114             else { current->setFastaFile(fastafile); }
115 
116             listfile = validParameter.validFile(parameters, "list");
117             if (listfile == "not open") { abort = true; }
118             else if (listfile == "not found") {	listfile = "";  }
119             else { current->setListFile(listfile); }
120 
121             fastqfile = validParameter.validFile(parameters, "fastq");
122             if (fastqfile == "not open") { abort = true; }
123             else if (fastqfile == "not found") {	fastqfile = "";  }
124 
125             flowfile = validParameter.validFile(parameters, "flow");
126             if (flowfile == "not open") { abort = true; }
127             else if (flowfile == "not found") {	flowfile = "";  }
128             else { current->setFlowFile(flowfile); }
129 
130 			groupfile = validParameter.validFile(parameters, "group");
131 			if (groupfile == "not open") {  groupfile = ""; abort = true; }
132 			else if (groupfile == "not found") { groupfile = "";
133 			}else {  current->setGroupFile(groupfile); }
134 
135             countfile = validParameter.validFile(parameters, "count");
136 			if (countfile == "not open") { countfile = ""; abort = true; }
137 			else if (countfile == "not found") { countfile = ""; }
138 			else { current->setCountFile(countfile); }
139 
140             if ((fastafile == "") && (flowfile == "") && (fastqfile == "")) {
141                 fastafile = current->getFastaFile();
142                 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter.\n"); }
143                 else {
144                     flowfile = current->getFlowFile();
145                     if (flowfile != "") {  m->mothurOut("Using " + flowfile + " as input file for the flow parameter.\n");  }
146                     else {
147                         listfile = current->getListFile();
148                         if (listfile != "") {  m->mothurOut("Using " + listfile + " as input file for the list parameter.\n");  }
149                         else { m->mothurOut("[ERROR]: You need to provide a fasta, list, fastq or flow file.\n");  abort = true; }
150                     }
151                 }
152             }
153 
154             if ((countfile != "") && (namefile != "")) { m->mothurOut("You must enter ONLY ONE of the following: count or name.\n");  abort = true; }
155 
156             if ((countfile != "") && (groupfile != "")) { m->mothurOut("You must enter ONLY ONE of the following: count or group.\n");  abort = true; }
157 
158             if ((countfile == "") && (groupfile == "")) {
159                 if (namefile == "") { //check for count then group
160                     countfile = current->getCountFile();
161 					if (countfile != "") {  m->mothurOut("Using " + countfile + " as input file for the count parameter.\n");  }
162 					else {
163 						groupfile = current->getGroupFile();
164                         if (groupfile != "") {  m->mothurOut("Using " + groupfile + " as input file for the group parameter.\n");  }
165                         else {
166                             m->mothurOut("You need to provide a count or group file.\n");
167                             abort = true;
168                         }
169 					}
170                 }else { //check for group
171                     groupfile = current->getGroupFile();
172                     if (groupfile != "") {  m->mothurOut("Using " + groupfile + " as input file for the group parameter.\n");  }
173                     else {
174                         m->mothurOut("You need to provide a count or group file.\n");
175                         abort = true;
176                     }
177                 }
178             }
179 
180 			groups = validParameter.valid(parameters, "groups");
181 			if (groups == "not found") { groups = ""; }
182 			else {
183                 util.splitAtDash(groups, Groups);
184                 if (Groups.size() != 0) {
185                     if (Groups[0]== "all") { Groups.clear(); }
186                 }
187             }
188 
189             string temp = validParameter.valid(parameters, "processors");    if (temp == "not found"){    temp = current->getProcessors();    }
190             processors = current->setProcessors(temp);
191 
192             format = validParameter.valid(parameters, "format");		if (format == "not found"){	format = "illumina1.8+";	}
193 
194             if ((format != "sanger") && (format != "illumina") && (format != "illumina1.8+") && (format != "solexa"))  {
195                 m->mothurOut(format + " is not a valid format. Your format choices are sanger, solexa, illumina1.8+ and illumina, aborting.\n" );
196                 abort=true;
197             }
198 
199 
200             if (outputdir == ""){
201                 if (groupfile != "") { outputdir = util.hasPath(groupfile); }
202                 else { outputdir = util.hasPath(countfile);  }
203             }
204 
205             if (countfile == "") {
206                 if (namefile == "") {
207                     vector<string> files;
208                     if (fastafile != "")    {  files.push_back(fastafile);  }
209                     else                    {  files.push_back(flowfile);   }
210                     if (!current->getMothurCalling())  {  parser.getNameFile(files);  }
211                 }
212             }
213 		}
214 
215 	}
216 	catch(exception& e) {
217 		m->errorOut(e, "SplitGroupCommand", "SplitGroupCommand");
218 		exit(1);
219 	}
220 }
221 //**********************************************************************************************************************
execute()222 int SplitGroupCommand::execute(){
223 	try {
224 
225 		if (abort) { if (calledHelp) { return 0; }  return 2;	}
226 
227         vector<string> namesGroups;
228         if (groupfile != "") {
229             GroupMap groupMap(groupfile);
230             groupMap.readMap();
231             namesGroups = groupMap.getNamesOfGroups();
232         }else if (countfile != ""){
233             CountTable ct;
234             ct.readTable(countfile, true, true, Groups);
235             namesGroups = ct.getNamesOfGroups();
236         }else { m->mothurOut("[ERROR]: you must provide a count or group file to split by group. quitting... \n"); m->setControl_pressed(true);  return 0; }
237 
238         if (Groups.size() == 0) { Groups = namesGroups; }
239 
240         if (processors > Groups.size()) { processors = Groups.size(); m->mothurOut("Reducing processors to " + toString(Groups.size()) + ".\n"); }
241 
242         //divide the groups between the processors
243         int remainingPairs = Groups.size();
244         int startIndex = 0;
245         for (int remainingProcessors = processors; remainingProcessors > 0; remainingProcessors--) {
246           int numPairs = remainingPairs; //case for last processor
247           if (remainingProcessors != 1) { numPairs = ceil(remainingPairs / remainingProcessors); }
248           lines.push_back(linePair(startIndex, (startIndex+numPairs))); //startIndex, endIndex
249           startIndex = startIndex + numPairs;
250           remainingPairs = remainingPairs - numPairs;
251         }
252 
253 
254         if (flowfile != "")         {  splitFastqOrFlow(flowfile, ".flow");     }
255         if (fastqfile != "")        {  splitFastqOrFlow(fastqfile, ".fastq");   }
256         if ((fastafile != "") || (listfile != ""))      {
257             bool isCount = true;
258             if (countfile == "" )   {  isCount = false;  }
259             splitCountOrGroup(isCount);
260         }
261 
262 		if (m->getControl_pressed()) { for (int i = 0; i < outputNames.size(); i++) {	util.mothurRemove(outputNames[i]);	} return 0; }
263 
264 		string currentName = "";
265 		itTypes = outputTypes.find("fasta");
266 		if (itTypes != outputTypes.end()) {
267 			if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setFastaFile(currentName); }
268 		}
269 
270         itTypes = outputTypes.find("flow");
271         if (itTypes != outputTypes.end()) {
272             if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setFlowFile(currentName); }
273         }
274 
275 		itTypes = outputTypes.find("name");
276 		if (itTypes != outputTypes.end()) {
277 			if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setNameFile(currentName); }
278 		}
279 
280         itTypes = outputTypes.find("count");
281 		if (itTypes != outputTypes.end()) {
282 			if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setCountFile(currentName); }
283 		}
284 
285 		m->mothurOut("\nOutput File Names: \n");
286 		for (int i = 0; i < outputNames.size(); i++) {	m->mothurOut(outputNames[i] +"\n"); 	} m->mothurOutEndLine();
287 
288 		return 0;
289 	}
290 	catch(exception& e) {
291 		m->errorOut(e, "SplitGroupCommand", "execute");
292 		exit(1);
293 	}
294 }
295 //**********************************************************************************************************************
driverRunNameGroup(splitGroups2Struct * params)296 int driverRunNameGroup(splitGroups2Struct* params){
297 	try {
298 		if (params->m->getControl_pressed()) { return 0; }
299 
300         GroupMap groupMap; groupMap.readMap(params->groupfile, params->Groups);
301 		vector<string> namesGroups = groupMap.getNamesOfGroups();
302         if (params->Groups.size() == 0) { params->Groups = namesGroups; }
303 
304 		//GroupName -> files(fasta, list, group, name)
305         for (int i = 0; i < params->Groups.size(); i++) {
306 
307             vector<string> files;
308             map<string, vector<string> >::iterator it = params->group2Files.find(params->Groups[i]);
309 
310             if (it != params->group2Files.end()) { files = it->second; }
311             else { params->m->mothurOut("[ERROR]: Can find group " + params->Groups[i] + ", quitting.\n"); params->m->setControl_pressed(true); break; }
312 
313 			params->m->mothurOut("Processing group: " + params->Groups[i] + "\n");
314 
315             vector<string> namesSeqsInThisGroup = groupMap.getNamesSeqs(params->Groups[i]);
316             ofstream outGroup, outAccnos;
317             params->util.openOutputFile(files[2], outGroup);
318             params->util.openOutputFile(files[2]+".accnos", outAccnos);
319             for (long long j = 0; j < namesSeqsInThisGroup.size(); j++) {
320                 outGroup << namesSeqsInThisGroup[j] << '\t' << params->Groups[i] << endl;
321                 outAccnos << namesSeqsInThisGroup[j] << endl;
322             }
323             outGroup.close(); outAccnos.close();
324             params->outputNames.push_back(files[2]); params->outputTypes["group"].push_back(files[2]);
325 
326             //use unique.seqs to create new name and fastafile
327             string uniqueFasta = params->fastafile+params->Groups[i];
328             string uniqueName = params->namefile+params->Groups[i];
329             string uniqueList = params->listfile+params->Groups[i];
330 
331             string inputString = "dups=f, accnos=" + files[2]+".accnos";
332             if (params->namefile != "") {
333                 inputString += ", name=" + uniqueName;
334                 params->util.copyFile(params->namefile, uniqueName);
335             }
336             if (params->fastafile != "") {
337                 inputString += ", fasta=" + uniqueFasta;
338                 params->util.copyFile(params->fastafile, uniqueFasta);
339             }
340             if (params->listfile != "")  {
341                 inputString += ", list=" + uniqueList;
342                 params->util.copyFile(params->listfile, uniqueList);
343             }
344 
345             params->m->mothurOut("/******************************************/\n");
346             params->m->mothurOut("Running command: get.seqs(" + inputString + ")\n");
347 
348             Command* getCommand = new GetSeqsCommand(inputString);
349             getCommand->execute();
350             map<string, vector<string> > filenames = getCommand->getOutputFiles();
351             delete getCommand;
352 
353             if (params->fastafile != "") {
354                 params->util.renameFile(filenames["fasta"][0], files[0]);
355                 params->outputNames.push_back(files[0]); params->outputTypes["fasta"].push_back(files[0]);
356                 params->util.mothurRemove(uniqueFasta);
357             }
358             if (params->listfile != "") {
359                 params->util.renameFile(filenames["list"][0], files[1]);
360                 params->outputNames.push_back(files[1]); params->outputTypes["list"].push_back(files[1]);
361                 params->util.mothurRemove(uniqueList);
362             }
363             if (params->namefile != "") {
364                 params->util.renameFile(filenames["name"][0], files[3]);
365                 params->outputNames.push_back(files[3]); params->outputTypes["name"].push_back(files[3]);
366             }
367 
368             params->m->mothurOut("/******************************************/\nDone.\n");
369 
370             params->util.mothurRemove(files[2]+".accnos");
371             params->util.mothurRemove(uniqueName);
372 
373 			if (params->m->getControl_pressed()) {  for (int i = 0; i < params->outputNames.size(); i++) {	params->util.mothurRemove(params->outputNames[i]);	} return 0; }
374 		}
375 
376         return 0;
377     }
378 	catch(exception& e) {
379 		params->m->errorOut(e, "SplitGroupCommand", "driverRunNameGroup");
380 		exit(1);
381 	}
382 }
383 //**********************************************************************************************************************
driverRunCount(splitGroups2Struct * params)384 int driverRunCount(splitGroups2Struct* params){
385     try {
386         CountTable ct; ct.readTable(params->countfile, true, false, params->Groups);
387         if (!ct.hasGroupInfo()) { params->m->mothurOut("[ERROR]: your count file does not contain group info, cannot split by group.\n"); params->m->setControl_pressed(true); }
388 
389         if (params->m->getControl_pressed()) { return 0; }
390 
391         params->Groups = ct.getNamesOfGroups();
392 
393         string processTag = toString(params->start) + "_" + toString(params->end);
394         string uniqueFasta = params->fastafile+processTag;
395         string uniqueList = params->listfile+processTag;
396 
397         if (params->fastafile != "") { params->util.copyFile(params->fastafile, uniqueFasta); }else{ uniqueFasta = ""; }
398         if (params->listfile != "")  { params->util.copyFile(params->listfile, uniqueList); }else{ uniqueList = ""; }
399 
400         //GroupName -> files(fasta, list, count)
401         for (int i = 0; i < params->Groups.size(); i++) { //Groups only contains the samples assigned to this process
402 
403             vector<string> files;
404             map<string, vector<string> >::iterator it = params->group2Files.find(params->Groups[i]);
405 
406             if (it != params->group2Files.end()) { files = it->second; }
407             else { params->m->mothurOut("[ERROR]: Can find group " + params->Groups[i] + ", quitting.\n"); params->m->setControl_pressed(true); break; }
408 
409             string newCountFile = files[2];
410             vector<string> tempGroups; tempGroups.push_back(params->Groups[i]);
411             ct.printCompressedTable(newCountFile, tempGroups);
412             params->outputNames.push_back(newCountFile); params->outputTypes["count"].push_back(newCountFile);
413             vector<string> namesOfSeqsInGroup = ct.getNamesOfSeqs(params->Groups[i]);
414             set<string> thisGroupsNames = params->util.mothurConvert(namesOfSeqsInGroup);
415 
416             params->m->mothurOut("/******************************************/\n");
417             params->m->mothurOut("Selecting sequences for group " + params->Groups[i] + "\n\n");
418 
419             Command* getCommand = new GetSeqsCommand(thisGroupsNames, uniqueFasta, uniqueList, "", "", params->outputDir);
420 
421             map<string, vector<string> > filenames = getCommand->getOutputFiles();
422 
423             delete getCommand;
424 
425             if (params->fastafile != "") {
426                 params->util.renameFile(filenames["fasta"][0], files[0]);
427                 params->outputNames.push_back(files[0]); params->outputTypes["fasta"].push_back(files[0]);
428             }
429             if (params->listfile != "") {
430                 params->util.renameFile(filenames["list"][0], files[1]);
431                 params->outputNames.push_back(files[1]); params->outputTypes["list"].push_back(files[1]);
432             }
433 
434             params->m->mothurOut("/******************************************/\n\n");
435 
436             if (params->m->getControl_pressed()) {  for (int i = 0; i < params->outputNames.size(); i++) {	params->util.mothurRemove(params->outputNames[i]);	} break; }
437         }
438 
439         if (params->fastafile != "") { params->util.mothurRemove(uniqueFasta); }
440         if (params->listfile != "") { params->util.mothurRemove(uniqueList);   }
441 
442         return 0;
443     }
444     catch(exception& e) {
445         params->m->errorOut(e, "SplitGroupCommand", "driverRunCount");
446         exit(1);
447     }
448 }
449 //**********************************************************************************************************************
splitCountOrGroup(bool isCount)450 void SplitGroupCommand::splitCountOrGroup(bool isCount){
451     try {
452         //create array of worker threads
453         vector<std::thread*> workerThreads;
454         vector<splitGroups2Struct*> data;
455 
456         //Lauch worker threads
457         for (int i = 0; i < processors-1; i++) {
458             splitGroups2Struct* dataBundle = new splitGroups2Struct(groupfile, countfile, namefile, Groups, lines[i+1].start, lines[i+1].end);
459             dataBundle->setFiles(fastafile, listfile, outputdir);
460             data.push_back(dataBundle);
461 
462             if (isCount) {
463                 workerThreads.push_back(new std::thread(driverRunCount, dataBundle));
464             }else {
465                 workerThreads.push_back(new std::thread(driverRunNameGroup, dataBundle));
466             }
467         }
468 
469         splitGroups2Struct* dataBundle = new splitGroups2Struct(groupfile, countfile, namefile, Groups, lines[0].start, lines[0].end);
470         dataBundle->setFiles(fastafile, listfile, outputdir);
471         if (isCount) {
472             driverRunCount(dataBundle);
473         }else {
474             driverRunNameGroup(dataBundle);
475         }
476         outputNames.insert(outputNames.end(), dataBundle->outputNames.begin(), dataBundle->outputNames.end());
477         for (itTypes = dataBundle->outputTypes.begin(); itTypes != dataBundle->outputTypes.end(); itTypes++) {
478             outputTypes[itTypes->first].insert(outputTypes[itTypes->first].end(), itTypes->second.begin(), itTypes->second.end());
479         }
480 
481         delete dataBundle;
482 
483         for (int i = 0; i < processors-1; i++) {
484             workerThreads[i]->join();
485 
486             outputNames.insert(outputNames.end(), data[i]->outputNames.begin(), data[i]->outputNames.end());
487             for (itTypes = data[i]->outputTypes.begin(); itTypes != data[i]->outputTypes.end(); itTypes++) {
488                 outputTypes[itTypes->first].insert(outputTypes[itTypes->first].end(), itTypes->second.begin(), itTypes->second.end());
489             }
490 
491             delete data[i];
492             delete workerThreads[i];
493         }
494     }
495     catch(exception& e) {
496         m->errorOut(e, "SplitGroupCommand", "splitCountOrGroup");
497         exit(1);
498     }
499 }
500 //**********************************************************************************************************************
driverSplitFlow(splitGroupsStruct * params)501 int driverSplitFlow(splitGroupsStruct* params){
502     try {
503         GroupMap* groupMap = NULL;
504         CountTable* ct = NULL;
505         vector<string> namesGroups;
506         if (params->groupfile != "") {
507             groupMap = new GroupMap(params->groupfile);
508             groupMap->readMap();
509             namesGroups = groupMap->getNamesOfGroups();
510         }else if (params->countfile != ""){
511             ct = new CountTable();
512             ct->readTable(params->countfile, true, true, params->Groups);
513             namesGroups = ct->getNamesOfGroups();
514         }else { params->m->mothurOut("[ERROR]: you must provide a count or group file to split by group. quitting... \n"); params->m->setControl_pressed(true);  }
515 
516         if (params->Groups.size() == 0) { params->Groups = namesGroups; }
517 
518         if (params->m->getControl_pressed()) { if (groupMap != NULL) { delete groupMap; }else if (ct != NULL) { delete ct; } return 0; }
519 
520         string name, flows;
521         int count = 0;
522         ifstream in; params->util.openInputFile(params->inputFileName, in);
523         in >> flows; params->util.gobble(in);
524 
525         while (!in.eof()) {
526             if (params->m->getControl_pressed()) { break; }
527 
528             in >> name; params->util.gobble(in);
529             flows = params->util.getline(in); params->util.gobble(in);
530 
531             vector<string> thisSeqsGroups;
532             if (groupMap != NULL) {
533                 string thisGroup = groupMap->getGroup(name);
534                 thisSeqsGroups.push_back(thisGroup);
535             }else if (ct != NULL) { thisSeqsGroups  = ct->getGroups(name); }
536 
537             for (int i = 0; i < thisSeqsGroups.size(); i++) {
538 
539                 map<string, flowOutput>::iterator it = params->parsedFlowData.find(thisSeqsGroups[i]);
540 
541                 if (it != params->parsedFlowData.end()) {
542                     it->second.total++; it->second.output += name + ' ' + flows + '\n';
543                     if (it->second.total % 100 == 0) { //buffer write
544                         ofstream out; params->util.openOutputFileAppend(it->second.filename, out);
545                         out << it->second.output; it->second.output = ""; out.close();
546                     }
547                 } //else not in the groups we are looking to parse, so ignore
548             }
549 
550             count++;
551         }
552 
553         //output rest
554         for (map<string, flowOutput>::iterator it = params->parsedFlowData.begin(); it != params->parsedFlowData.end(); it++) {
555             if (params->m->getControl_pressed()) { break; }
556 
557             if (it->second.output != "") { //more seqs to output
558                 ofstream out; params->util.openOutputFileAppend(it->second.filename, out);
559                 out << it->second.output; it->second.output = ""; params->outputNames.push_back(it->second.filename); params->outputTypes["flow"].push_back(it->second.filename);
560             }else if (it->second.total == 0) { //no seqs for this group, remove file
561                 params->util.mothurRemove(it->second.filename);
562             }else { //finished writing, just add to list of output files
563                 params->outputNames.push_back(it->second.filename); params->outputTypes["flow"].push_back(it->second.filename);
564             }
565         }
566 
567         if (params->m->getControl_pressed()) { if (groupMap != NULL) { delete groupMap; }else if (ct != NULL) { delete ct; } return 0; }
568 
569         return count;
570     }
571     catch(exception& e) {
572         params->m->errorOut(e, "SplitGroupCommand", "driverSplitFlow");
573         exit(1);
574     }
575 }
576 
577 //**********************************************************************************************************************
driverSplitFastq(splitGroupsStruct * params)578 int driverSplitFastq(splitGroupsStruct* params){
579     try {
580         GroupMap* groupMap = NULL;
581         CountTable* ct = NULL;
582         vector<string> namesGroups;
583         if (params->groupfile != "") {
584             groupMap = new GroupMap(params->groupfile);
585             groupMap->readMap();
586             namesGroups = groupMap->getNamesOfGroups();
587         }else if (params->countfile != ""){
588             ct = new CountTable();
589             ct->readTable(params->countfile, true, true, params->Groups);
590             namesGroups = ct->getNamesOfGroups();
591         }else { params->m->mothurOut("[ERROR]: you must provide a count or group file to split by group. quitting... \n"); params->m->setControl_pressed(true);  return 0; }
592 
593         if (params->m->getControl_pressed()) { if (groupMap != NULL) { delete groupMap; }else if (ct != NULL) { delete ct; } return 0; }
594 
595         int count = 0;
596         ifstream in; params->util.openInputFile(params->inputFileName, in);
597 
598         while (!in.eof()) {
599             if (params->m->getControl_pressed()) { break; }
600 
601             bool ignore = false;
602             FastqRead thisRead(in, ignore, params->format); params->util.gobble(in);
603             string name = thisRead.getName();
604 
605             vector<string> thisSeqsGroups;
606             if (groupMap != NULL) {
607                 string thisGroup = groupMap->getGroup(name);
608                 thisSeqsGroups.push_back(thisGroup);
609             }else if (ct != NULL) { thisSeqsGroups  = ct->getGroups(name); }
610 
611             for (int i = 0; i < thisSeqsGroups.size(); i++) {
612 
613                 map<string, fastqOutput>::iterator it = params->parsedFastqData.find(thisSeqsGroups[i]);
614 
615                 if (it != params->parsedFastqData.end()) {
616                     it->second.total++; it->second.output.push_back(thisRead);
617                     if (it->second.total % 500 == 0) { //buffer write
618                         ofstream out; params->util.openOutputFileAppend(it->second.filename, out);
619                         for (int j = 0; j < it->second.output.size(); j++) { it->second.output[j].printFastq(out); }
620                         it->second.output.clear(); out.close();
621                     }
622                 } //else not in the groups we are looking to parse, so ignore
623             }
624 
625             count++;
626         }
627 
628         //output rest
629         for (map<string, fastqOutput>::iterator it = params->parsedFastqData.begin(); it != params->parsedFastqData.end(); it++) {
630             if (params->m->getControl_pressed()) { break; }
631 
632             if (it->second.output.size() != 0) { //more seqs to output
633                 ofstream out; params->util.openOutputFileAppend(it->second.filename, out);
634                 for (int j = 0; j < it->second.output.size(); j++) { it->second.output[j].printFastq(out); }
635                 it->second.output.clear(); out.close();
636                 params->outputNames.push_back(it->second.filename); params->outputTypes["fastq"].push_back(it->second.filename);
637             }else if (it->second.total == 0) { //no seqs for this group, remove file
638                 params->util.mothurRemove(it->second.filename);
639             }else { //finished writing, just add to list of output files
640                 params->outputNames.push_back(it->second.filename); params->outputTypes["fastq"].push_back(it->second.filename);
641             }
642         }
643 
644         if (params->m->getControl_pressed()) { if (groupMap != NULL) { delete groupMap; }else if (ct != NULL) { delete ct; } return 0; }
645 
646         return count;
647 
648     }
649     catch(exception& e) {
650         params->m->errorOut(e, "SplitGroupCommand", "driverSplitFastq");
651         exit(1);
652     }
653 }
654 //**********************************************************************************************************************
splitFastqOrFlow(string inputFile,string extension)655 void SplitGroupCommand::splitFastqOrFlow(string inputFile, string extension){
656     try {
657         //create array of worker threads
658         vector<std::thread*> workerThreads;
659         vector<splitGroupsStruct*> data;
660 
661         string outputfileRoot = outputdir + util.getRootName(util.getSimpleName(inputFile));
662 
663         //Lauch worker threads
664         for (int i = 0; i < processors-1; i++) {
665             splitGroupsStruct* dataBundle = new splitGroupsStruct(groupfile, countfile, namefile, Groups, lines[i+1].start, lines[i+1].end);
666             dataBundle->setFiles(inputFile, outputfileRoot, extension);
667             dataBundle->setFormat(format);
668             data.push_back(dataBundle);
669 
670             if (extension == ".fastq") {
671                 workerThreads.push_back(new std::thread(driverSplitFastq, dataBundle));
672             }else {
673                 workerThreads.push_back(new std::thread(driverSplitFlow, dataBundle));
674             }
675         }
676 
677         splitGroupsStruct* dataBundle = new splitGroupsStruct(groupfile, countfile, namefile, Groups, lines[0].start, lines[0].end);
678         dataBundle->setFiles(inputFile, outputfileRoot, extension);
679         dataBundle->setFormat(format);
680 
681         if (extension == ".fastq")  { driverSplitFastq(dataBundle); }
682         else                        { driverSplitFlow(dataBundle);   }
683 
684         outputNames.insert(outputNames.end(), dataBundle->outputNames.begin(), dataBundle->outputNames.end());
685         for (itTypes = dataBundle->outputTypes.begin(); itTypes != dataBundle->outputTypes.end(); itTypes++) {
686             outputTypes[itTypes->first].insert(outputTypes[itTypes->first].end(), itTypes->second.begin(), itTypes->second.end());
687         }
688 
689         for (int i = 0; i < processors-1; i++) {
690             workerThreads[i]->join();
691 
692             outputNames.insert(outputNames.end(), data[i]->outputNames.begin(), data[i]->outputNames.end());
693             for (itTypes = data[i]->outputTypes.begin(); itTypes != data[i]->outputTypes.end(); itTypes++) {
694                 outputTypes[itTypes->first].insert(outputTypes[itTypes->first].end(), itTypes->second.begin(), itTypes->second.end());
695             }
696 
697             delete data[i];
698             delete workerThreads[i];
699         }
700 
701         delete dataBundle;
702     }
703     catch(exception& e) {
704         m->errorOut(e, "SplitGroupCommand", "splitFastqOrFlow");
705         exit(1);
706     }
707 }
708 //**********************************************************************************************************************
709 
710 
711 
712