1 /*
2 * splitgroupscommand.cpp
3 * Mothur
4 *
5 * Created by westcott on 9/20/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
7 *
8 */
9
10 #include "splitgroupscommand.h"
11 #include "sequenceparser.h"
12 #include "counttable.h"
13
14 //**********************************************************************************************************************
setParameters()15 vector<string> SplitGroupCommand::setParameters(){
16 try {
17 CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none","list",false,false,true); parameters.push_back(plist);
18 CommandParameter pflow("flow", "InputTypes", "", "", "none", "none", "none","fasta",false,false,true); parameters.push_back(pflow);
19 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","fasta",false,false,true); parameters.push_back(pfasta);
20 CommandParameter pfastq("fastq", "InputTypes", "", "", "none", "none", "none","fastq",false,false,true); parameters.push_back(pfastq);
21 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none","name",false,false,true); parameters.push_back(pname);
22 CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "CountGroup", "none","count",false,false,true); parameters.push_back(pcount);
23 CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "CountGroup", "none","group",false,false,true); parameters.push_back(pgroup);
24 CommandParameter pgroups("groups", "String", "", "", "", "", "","",false,false); parameters.push_back(pgroups);
25 CommandParameter pformat("format", "Multiple", "sanger-illumina-solexa-illumina1.8+", "sanger", "", "", "","",false,false,true); parameters.push_back(pformat);
26 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
27 CommandParameter pseed("seed", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pseed);
28 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
29 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
30
31 vector<string> tempOutNames;
32 outputTypes["fasta"] = tempOutNames;
33 outputTypes["flow"] = tempOutNames;
34 outputTypes["fastq"] = tempOutNames;
35 outputTypes["name"] = tempOutNames;
36 outputTypes["count"] = tempOutNames;
37 outputTypes["group"] = tempOutNames;
38 outputTypes["list"] = tempOutNames;
39
40 abort = false; calledHelp = false;
41
42 vector<string> myArray;
43 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
44 return myArray;
45 }
46 catch(exception& e) {
47 m->errorOut(e, "SplitGroupCommand", "setParameters");
48 exit(1);
49 }
50 }
51 //**********************************************************************************************************************
getHelpString()52 string SplitGroupCommand::getHelpString(){
53 try {
54 string helpString = "";
55 helpString += "The split.groups command reads a group or count file, and parses your files by groups. \n";
56 helpString += "The split.groups command parameters are fasta, fastq, flow, name, group, count, groups and processors.\n";
57 helpString += "The group or count parameter is required.\n";
58 helpString += "The groups parameter allows you to select groups to create files for. \n";
59 helpString += "The format parameter is used with the fastq parameter to indicate whether your sequences are sanger, solexa, illumina1.8+ or illumina, default=illumina1.8+.\n";
60 helpString += "For example if you set groups=A-B-C, you will get a .A.fasta, .A.names, .B.fasta, .B.names, .C.fasta, .C.names files. \n";
61 helpString += "If you want .fasta and .names files for all groups, set groups=all. \n";
62 helpString += "The split.groups command should be used in the following format: split.group(fasta=yourFasta, group=yourGroupFile).\n";
63 helpString += "Example: split.groups(fasta=abrecovery.fasta, group=abrecovery.groups).\n";
64 ;
65 return helpString;
66 }
67 catch(exception& e) {
68 m->errorOut(e, "SplitGroupCommand", "getHelpString");
69 exit(1);
70 }
71 }
72 //**********************************************************************************************************************
getOutputPattern(string type)73 string SplitGroupCommand::getOutputPattern(string type) {
74 try {
75 string pattern = "";
76
77 if (type == "fasta") { pattern = "[filename],[group],fasta"; }
78 else if (type == "list") { pattern = "[filename],[group],list"; }
79 else if (type == "fastq") { pattern = "[filename],[group],fastq"; }
80 else if (type == "flow") { pattern = "[filename],[group],flow"; }
81 else if (type == "name") { pattern = "[filename],[group],names"; }
82 else if (type == "count") { pattern = "[filename],[group],count_table"; }
83 else if (type == "group") { pattern = "[filename],[group],groups"; }
84 else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->setControl_pressed(true); }
85
86 return pattern;
87 }
88 catch(exception& e) {
89 m->errorOut(e, "SplitGroupCommand", "getOutputPattern");
90 exit(1);
91 }
92 }
93
94 //**********************************************************************************************************************
SplitGroupCommand(string option)95 SplitGroupCommand::SplitGroupCommand(string option) : Command() {
96 try {
97 if(option == "help") { help(); abort = true; calledHelp = true; }
98 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
99 else if(option == "category") { abort = true; calledHelp = true; }
100
101 else {
102 OptionParser parser(option, setParameters());
103 map<string, string> parameters = parser.getParameters();
104
105 ValidParameters validParameter;
106 namefile = validParameter.validFile(parameters, "name");
107 if (namefile == "not open") { namefile = ""; abort = true; }
108 else if (namefile == "not found") { namefile = ""; }
109 else { current->setNameFile(namefile); }
110
111 fastafile = validParameter.validFile(parameters, "fasta");
112 if (fastafile == "not open") { abort = true; }
113 else if (fastafile == "not found") { fastafile = ""; }
114 else { current->setFastaFile(fastafile); }
115
116 listfile = validParameter.validFile(parameters, "list");
117 if (listfile == "not open") { abort = true; }
118 else if (listfile == "not found") { listfile = ""; }
119 else { current->setListFile(listfile); }
120
121 fastqfile = validParameter.validFile(parameters, "fastq");
122 if (fastqfile == "not open") { abort = true; }
123 else if (fastqfile == "not found") { fastqfile = ""; }
124
125 flowfile = validParameter.validFile(parameters, "flow");
126 if (flowfile == "not open") { abort = true; }
127 else if (flowfile == "not found") { flowfile = ""; }
128 else { current->setFlowFile(flowfile); }
129
130 groupfile = validParameter.validFile(parameters, "group");
131 if (groupfile == "not open") { groupfile = ""; abort = true; }
132 else if (groupfile == "not found") { groupfile = "";
133 }else { current->setGroupFile(groupfile); }
134
135 countfile = validParameter.validFile(parameters, "count");
136 if (countfile == "not open") { countfile = ""; abort = true; }
137 else if (countfile == "not found") { countfile = ""; }
138 else { current->setCountFile(countfile); }
139
140 if ((fastafile == "") && (flowfile == "") && (fastqfile == "")) {
141 fastafile = current->getFastaFile();
142 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter.\n"); }
143 else {
144 flowfile = current->getFlowFile();
145 if (flowfile != "") { m->mothurOut("Using " + flowfile + " as input file for the flow parameter.\n"); }
146 else {
147 listfile = current->getListFile();
148 if (listfile != "") { m->mothurOut("Using " + listfile + " as input file for the list parameter.\n"); }
149 else { m->mothurOut("[ERROR]: You need to provide a fasta, list, fastq or flow file.\n"); abort = true; }
150 }
151 }
152 }
153
154 if ((countfile != "") && (namefile != "")) { m->mothurOut("You must enter ONLY ONE of the following: count or name.\n"); abort = true; }
155
156 if ((countfile != "") && (groupfile != "")) { m->mothurOut("You must enter ONLY ONE of the following: count or group.\n"); abort = true; }
157
158 if ((countfile == "") && (groupfile == "")) {
159 if (namefile == "") { //check for count then group
160 countfile = current->getCountFile();
161 if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter.\n"); }
162 else {
163 groupfile = current->getGroupFile();
164 if (groupfile != "") { m->mothurOut("Using " + groupfile + " as input file for the group parameter.\n"); }
165 else {
166 m->mothurOut("You need to provide a count or group file.\n");
167 abort = true;
168 }
169 }
170 }else { //check for group
171 groupfile = current->getGroupFile();
172 if (groupfile != "") { m->mothurOut("Using " + groupfile + " as input file for the group parameter.\n"); }
173 else {
174 m->mothurOut("You need to provide a count or group file.\n");
175 abort = true;
176 }
177 }
178 }
179
180 groups = validParameter.valid(parameters, "groups");
181 if (groups == "not found") { groups = ""; }
182 else {
183 util.splitAtDash(groups, Groups);
184 if (Groups.size() != 0) {
185 if (Groups[0]== "all") { Groups.clear(); }
186 }
187 }
188
189 string temp = validParameter.valid(parameters, "processors"); if (temp == "not found"){ temp = current->getProcessors(); }
190 processors = current->setProcessors(temp);
191
192 format = validParameter.valid(parameters, "format"); if (format == "not found"){ format = "illumina1.8+"; }
193
194 if ((format != "sanger") && (format != "illumina") && (format != "illumina1.8+") && (format != "solexa")) {
195 m->mothurOut(format + " is not a valid format. Your format choices are sanger, solexa, illumina1.8+ and illumina, aborting.\n" );
196 abort=true;
197 }
198
199
200 if (outputdir == ""){
201 if (groupfile != "") { outputdir = util.hasPath(groupfile); }
202 else { outputdir = util.hasPath(countfile); }
203 }
204
205 if (countfile == "") {
206 if (namefile == "") {
207 vector<string> files;
208 if (fastafile != "") { files.push_back(fastafile); }
209 else { files.push_back(flowfile); }
210 if (!current->getMothurCalling()) { parser.getNameFile(files); }
211 }
212 }
213 }
214
215 }
216 catch(exception& e) {
217 m->errorOut(e, "SplitGroupCommand", "SplitGroupCommand");
218 exit(1);
219 }
220 }
221 //**********************************************************************************************************************
execute()222 int SplitGroupCommand::execute(){
223 try {
224
225 if (abort) { if (calledHelp) { return 0; } return 2; }
226
227 vector<string> namesGroups;
228 if (groupfile != "") {
229 GroupMap groupMap(groupfile);
230 groupMap.readMap();
231 namesGroups = groupMap.getNamesOfGroups();
232 }else if (countfile != ""){
233 CountTable ct;
234 ct.readTable(countfile, true, true, Groups);
235 namesGroups = ct.getNamesOfGroups();
236 }else { m->mothurOut("[ERROR]: you must provide a count or group file to split by group. quitting... \n"); m->setControl_pressed(true); return 0; }
237
238 if (Groups.size() == 0) { Groups = namesGroups; }
239
240 if (processors > Groups.size()) { processors = Groups.size(); m->mothurOut("Reducing processors to " + toString(Groups.size()) + ".\n"); }
241
242 //divide the groups between the processors
243 int remainingPairs = Groups.size();
244 int startIndex = 0;
245 for (int remainingProcessors = processors; remainingProcessors > 0; remainingProcessors--) {
246 int numPairs = remainingPairs; //case for last processor
247 if (remainingProcessors != 1) { numPairs = ceil(remainingPairs / remainingProcessors); }
248 lines.push_back(linePair(startIndex, (startIndex+numPairs))); //startIndex, endIndex
249 startIndex = startIndex + numPairs;
250 remainingPairs = remainingPairs - numPairs;
251 }
252
253
254 if (flowfile != "") { splitFastqOrFlow(flowfile, ".flow"); }
255 if (fastqfile != "") { splitFastqOrFlow(fastqfile, ".fastq"); }
256 if ((fastafile != "") || (listfile != "")) {
257 bool isCount = true;
258 if (countfile == "" ) { isCount = false; }
259 splitCountOrGroup(isCount);
260 }
261
262 if (m->getControl_pressed()) { for (int i = 0; i < outputNames.size(); i++) { util.mothurRemove(outputNames[i]); } return 0; }
263
264 string currentName = "";
265 itTypes = outputTypes.find("fasta");
266 if (itTypes != outputTypes.end()) {
267 if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setFastaFile(currentName); }
268 }
269
270 itTypes = outputTypes.find("flow");
271 if (itTypes != outputTypes.end()) {
272 if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setFlowFile(currentName); }
273 }
274
275 itTypes = outputTypes.find("name");
276 if (itTypes != outputTypes.end()) {
277 if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setNameFile(currentName); }
278 }
279
280 itTypes = outputTypes.find("count");
281 if (itTypes != outputTypes.end()) {
282 if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setCountFile(currentName); }
283 }
284
285 m->mothurOut("\nOutput File Names: \n");
286 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i] +"\n"); } m->mothurOutEndLine();
287
288 return 0;
289 }
290 catch(exception& e) {
291 m->errorOut(e, "SplitGroupCommand", "execute");
292 exit(1);
293 }
294 }
295 //**********************************************************************************************************************
driverRunNameGroup(splitGroups2Struct * params)296 int driverRunNameGroup(splitGroups2Struct* params){
297 try {
298 if (params->m->getControl_pressed()) { return 0; }
299
300 GroupMap groupMap; groupMap.readMap(params->groupfile, params->Groups);
301 vector<string> namesGroups = groupMap.getNamesOfGroups();
302 if (params->Groups.size() == 0) { params->Groups = namesGroups; }
303
304 //GroupName -> files(fasta, list, group, name)
305 for (int i = 0; i < params->Groups.size(); i++) {
306
307 vector<string> files;
308 map<string, vector<string> >::iterator it = params->group2Files.find(params->Groups[i]);
309
310 if (it != params->group2Files.end()) { files = it->second; }
311 else { params->m->mothurOut("[ERROR]: Can find group " + params->Groups[i] + ", quitting.\n"); params->m->setControl_pressed(true); break; }
312
313 params->m->mothurOut("Processing group: " + params->Groups[i] + "\n");
314
315 vector<string> namesSeqsInThisGroup = groupMap.getNamesSeqs(params->Groups[i]);
316 ofstream outGroup, outAccnos;
317 params->util.openOutputFile(files[2], outGroup);
318 params->util.openOutputFile(files[2]+".accnos", outAccnos);
319 for (long long j = 0; j < namesSeqsInThisGroup.size(); j++) {
320 outGroup << namesSeqsInThisGroup[j] << '\t' << params->Groups[i] << endl;
321 outAccnos << namesSeqsInThisGroup[j] << endl;
322 }
323 outGroup.close(); outAccnos.close();
324 params->outputNames.push_back(files[2]); params->outputTypes["group"].push_back(files[2]);
325
326 //use unique.seqs to create new name and fastafile
327 string uniqueFasta = params->fastafile+params->Groups[i];
328 string uniqueName = params->namefile+params->Groups[i];
329 string uniqueList = params->listfile+params->Groups[i];
330
331 string inputString = "dups=f, accnos=" + files[2]+".accnos";
332 if (params->namefile != "") {
333 inputString += ", name=" + uniqueName;
334 params->util.copyFile(params->namefile, uniqueName);
335 }
336 if (params->fastafile != "") {
337 inputString += ", fasta=" + uniqueFasta;
338 params->util.copyFile(params->fastafile, uniqueFasta);
339 }
340 if (params->listfile != "") {
341 inputString += ", list=" + uniqueList;
342 params->util.copyFile(params->listfile, uniqueList);
343 }
344
345 params->m->mothurOut("/******************************************/\n");
346 params->m->mothurOut("Running command: get.seqs(" + inputString + ")\n");
347
348 Command* getCommand = new GetSeqsCommand(inputString);
349 getCommand->execute();
350 map<string, vector<string> > filenames = getCommand->getOutputFiles();
351 delete getCommand;
352
353 if (params->fastafile != "") {
354 params->util.renameFile(filenames["fasta"][0], files[0]);
355 params->outputNames.push_back(files[0]); params->outputTypes["fasta"].push_back(files[0]);
356 params->util.mothurRemove(uniqueFasta);
357 }
358 if (params->listfile != "") {
359 params->util.renameFile(filenames["list"][0], files[1]);
360 params->outputNames.push_back(files[1]); params->outputTypes["list"].push_back(files[1]);
361 params->util.mothurRemove(uniqueList);
362 }
363 if (params->namefile != "") {
364 params->util.renameFile(filenames["name"][0], files[3]);
365 params->outputNames.push_back(files[3]); params->outputTypes["name"].push_back(files[3]);
366 }
367
368 params->m->mothurOut("/******************************************/\nDone.\n");
369
370 params->util.mothurRemove(files[2]+".accnos");
371 params->util.mothurRemove(uniqueName);
372
373 if (params->m->getControl_pressed()) { for (int i = 0; i < params->outputNames.size(); i++) { params->util.mothurRemove(params->outputNames[i]); } return 0; }
374 }
375
376 return 0;
377 }
378 catch(exception& e) {
379 params->m->errorOut(e, "SplitGroupCommand", "driverRunNameGroup");
380 exit(1);
381 }
382 }
383 //**********************************************************************************************************************
driverRunCount(splitGroups2Struct * params)384 int driverRunCount(splitGroups2Struct* params){
385 try {
386 CountTable ct; ct.readTable(params->countfile, true, false, params->Groups);
387 if (!ct.hasGroupInfo()) { params->m->mothurOut("[ERROR]: your count file does not contain group info, cannot split by group.\n"); params->m->setControl_pressed(true); }
388
389 if (params->m->getControl_pressed()) { return 0; }
390
391 params->Groups = ct.getNamesOfGroups();
392
393 string processTag = toString(params->start) + "_" + toString(params->end);
394 string uniqueFasta = params->fastafile+processTag;
395 string uniqueList = params->listfile+processTag;
396
397 if (params->fastafile != "") { params->util.copyFile(params->fastafile, uniqueFasta); }else{ uniqueFasta = ""; }
398 if (params->listfile != "") { params->util.copyFile(params->listfile, uniqueList); }else{ uniqueList = ""; }
399
400 //GroupName -> files(fasta, list, count)
401 for (int i = 0; i < params->Groups.size(); i++) { //Groups only contains the samples assigned to this process
402
403 vector<string> files;
404 map<string, vector<string> >::iterator it = params->group2Files.find(params->Groups[i]);
405
406 if (it != params->group2Files.end()) { files = it->second; }
407 else { params->m->mothurOut("[ERROR]: Can find group " + params->Groups[i] + ", quitting.\n"); params->m->setControl_pressed(true); break; }
408
409 string newCountFile = files[2];
410 vector<string> tempGroups; tempGroups.push_back(params->Groups[i]);
411 ct.printCompressedTable(newCountFile, tempGroups);
412 params->outputNames.push_back(newCountFile); params->outputTypes["count"].push_back(newCountFile);
413 vector<string> namesOfSeqsInGroup = ct.getNamesOfSeqs(params->Groups[i]);
414 set<string> thisGroupsNames = params->util.mothurConvert(namesOfSeqsInGroup);
415
416 params->m->mothurOut("/******************************************/\n");
417 params->m->mothurOut("Selecting sequences for group " + params->Groups[i] + "\n\n");
418
419 Command* getCommand = new GetSeqsCommand(thisGroupsNames, uniqueFasta, uniqueList, "", "", params->outputDir);
420
421 map<string, vector<string> > filenames = getCommand->getOutputFiles();
422
423 delete getCommand;
424
425 if (params->fastafile != "") {
426 params->util.renameFile(filenames["fasta"][0], files[0]);
427 params->outputNames.push_back(files[0]); params->outputTypes["fasta"].push_back(files[0]);
428 }
429 if (params->listfile != "") {
430 params->util.renameFile(filenames["list"][0], files[1]);
431 params->outputNames.push_back(files[1]); params->outputTypes["list"].push_back(files[1]);
432 }
433
434 params->m->mothurOut("/******************************************/\n\n");
435
436 if (params->m->getControl_pressed()) { for (int i = 0; i < params->outputNames.size(); i++) { params->util.mothurRemove(params->outputNames[i]); } break; }
437 }
438
439 if (params->fastafile != "") { params->util.mothurRemove(uniqueFasta); }
440 if (params->listfile != "") { params->util.mothurRemove(uniqueList); }
441
442 return 0;
443 }
444 catch(exception& e) {
445 params->m->errorOut(e, "SplitGroupCommand", "driverRunCount");
446 exit(1);
447 }
448 }
449 //**********************************************************************************************************************
splitCountOrGroup(bool isCount)450 void SplitGroupCommand::splitCountOrGroup(bool isCount){
451 try {
452 //create array of worker threads
453 vector<std::thread*> workerThreads;
454 vector<splitGroups2Struct*> data;
455
456 //Lauch worker threads
457 for (int i = 0; i < processors-1; i++) {
458 splitGroups2Struct* dataBundle = new splitGroups2Struct(groupfile, countfile, namefile, Groups, lines[i+1].start, lines[i+1].end);
459 dataBundle->setFiles(fastafile, listfile, outputdir);
460 data.push_back(dataBundle);
461
462 if (isCount) {
463 workerThreads.push_back(new std::thread(driverRunCount, dataBundle));
464 }else {
465 workerThreads.push_back(new std::thread(driverRunNameGroup, dataBundle));
466 }
467 }
468
469 splitGroups2Struct* dataBundle = new splitGroups2Struct(groupfile, countfile, namefile, Groups, lines[0].start, lines[0].end);
470 dataBundle->setFiles(fastafile, listfile, outputdir);
471 if (isCount) {
472 driverRunCount(dataBundle);
473 }else {
474 driverRunNameGroup(dataBundle);
475 }
476 outputNames.insert(outputNames.end(), dataBundle->outputNames.begin(), dataBundle->outputNames.end());
477 for (itTypes = dataBundle->outputTypes.begin(); itTypes != dataBundle->outputTypes.end(); itTypes++) {
478 outputTypes[itTypes->first].insert(outputTypes[itTypes->first].end(), itTypes->second.begin(), itTypes->second.end());
479 }
480
481 delete dataBundle;
482
483 for (int i = 0; i < processors-1; i++) {
484 workerThreads[i]->join();
485
486 outputNames.insert(outputNames.end(), data[i]->outputNames.begin(), data[i]->outputNames.end());
487 for (itTypes = data[i]->outputTypes.begin(); itTypes != data[i]->outputTypes.end(); itTypes++) {
488 outputTypes[itTypes->first].insert(outputTypes[itTypes->first].end(), itTypes->second.begin(), itTypes->second.end());
489 }
490
491 delete data[i];
492 delete workerThreads[i];
493 }
494 }
495 catch(exception& e) {
496 m->errorOut(e, "SplitGroupCommand", "splitCountOrGroup");
497 exit(1);
498 }
499 }
500 //**********************************************************************************************************************
driverSplitFlow(splitGroupsStruct * params)501 int driverSplitFlow(splitGroupsStruct* params){
502 try {
503 GroupMap* groupMap = NULL;
504 CountTable* ct = NULL;
505 vector<string> namesGroups;
506 if (params->groupfile != "") {
507 groupMap = new GroupMap(params->groupfile);
508 groupMap->readMap();
509 namesGroups = groupMap->getNamesOfGroups();
510 }else if (params->countfile != ""){
511 ct = new CountTable();
512 ct->readTable(params->countfile, true, true, params->Groups);
513 namesGroups = ct->getNamesOfGroups();
514 }else { params->m->mothurOut("[ERROR]: you must provide a count or group file to split by group. quitting... \n"); params->m->setControl_pressed(true); }
515
516 if (params->Groups.size() == 0) { params->Groups = namesGroups; }
517
518 if (params->m->getControl_pressed()) { if (groupMap != NULL) { delete groupMap; }else if (ct != NULL) { delete ct; } return 0; }
519
520 string name, flows;
521 int count = 0;
522 ifstream in; params->util.openInputFile(params->inputFileName, in);
523 in >> flows; params->util.gobble(in);
524
525 while (!in.eof()) {
526 if (params->m->getControl_pressed()) { break; }
527
528 in >> name; params->util.gobble(in);
529 flows = params->util.getline(in); params->util.gobble(in);
530
531 vector<string> thisSeqsGroups;
532 if (groupMap != NULL) {
533 string thisGroup = groupMap->getGroup(name);
534 thisSeqsGroups.push_back(thisGroup);
535 }else if (ct != NULL) { thisSeqsGroups = ct->getGroups(name); }
536
537 for (int i = 0; i < thisSeqsGroups.size(); i++) {
538
539 map<string, flowOutput>::iterator it = params->parsedFlowData.find(thisSeqsGroups[i]);
540
541 if (it != params->parsedFlowData.end()) {
542 it->second.total++; it->second.output += name + ' ' + flows + '\n';
543 if (it->second.total % 100 == 0) { //buffer write
544 ofstream out; params->util.openOutputFileAppend(it->second.filename, out);
545 out << it->second.output; it->second.output = ""; out.close();
546 }
547 } //else not in the groups we are looking to parse, so ignore
548 }
549
550 count++;
551 }
552
553 //output rest
554 for (map<string, flowOutput>::iterator it = params->parsedFlowData.begin(); it != params->parsedFlowData.end(); it++) {
555 if (params->m->getControl_pressed()) { break; }
556
557 if (it->second.output != "") { //more seqs to output
558 ofstream out; params->util.openOutputFileAppend(it->second.filename, out);
559 out << it->second.output; it->second.output = ""; params->outputNames.push_back(it->second.filename); params->outputTypes["flow"].push_back(it->second.filename);
560 }else if (it->second.total == 0) { //no seqs for this group, remove file
561 params->util.mothurRemove(it->second.filename);
562 }else { //finished writing, just add to list of output files
563 params->outputNames.push_back(it->second.filename); params->outputTypes["flow"].push_back(it->second.filename);
564 }
565 }
566
567 if (params->m->getControl_pressed()) { if (groupMap != NULL) { delete groupMap; }else if (ct != NULL) { delete ct; } return 0; }
568
569 return count;
570 }
571 catch(exception& e) {
572 params->m->errorOut(e, "SplitGroupCommand", "driverSplitFlow");
573 exit(1);
574 }
575 }
576
577 //**********************************************************************************************************************
driverSplitFastq(splitGroupsStruct * params)578 int driverSplitFastq(splitGroupsStruct* params){
579 try {
580 GroupMap* groupMap = NULL;
581 CountTable* ct = NULL;
582 vector<string> namesGroups;
583 if (params->groupfile != "") {
584 groupMap = new GroupMap(params->groupfile);
585 groupMap->readMap();
586 namesGroups = groupMap->getNamesOfGroups();
587 }else if (params->countfile != ""){
588 ct = new CountTable();
589 ct->readTable(params->countfile, true, true, params->Groups);
590 namesGroups = ct->getNamesOfGroups();
591 }else { params->m->mothurOut("[ERROR]: you must provide a count or group file to split by group. quitting... \n"); params->m->setControl_pressed(true); return 0; }
592
593 if (params->m->getControl_pressed()) { if (groupMap != NULL) { delete groupMap; }else if (ct != NULL) { delete ct; } return 0; }
594
595 int count = 0;
596 ifstream in; params->util.openInputFile(params->inputFileName, in);
597
598 while (!in.eof()) {
599 if (params->m->getControl_pressed()) { break; }
600
601 bool ignore = false;
602 FastqRead thisRead(in, ignore, params->format); params->util.gobble(in);
603 string name = thisRead.getName();
604
605 vector<string> thisSeqsGroups;
606 if (groupMap != NULL) {
607 string thisGroup = groupMap->getGroup(name);
608 thisSeqsGroups.push_back(thisGroup);
609 }else if (ct != NULL) { thisSeqsGroups = ct->getGroups(name); }
610
611 for (int i = 0; i < thisSeqsGroups.size(); i++) {
612
613 map<string, fastqOutput>::iterator it = params->parsedFastqData.find(thisSeqsGroups[i]);
614
615 if (it != params->parsedFastqData.end()) {
616 it->second.total++; it->second.output.push_back(thisRead);
617 if (it->second.total % 500 == 0) { //buffer write
618 ofstream out; params->util.openOutputFileAppend(it->second.filename, out);
619 for (int j = 0; j < it->second.output.size(); j++) { it->second.output[j].printFastq(out); }
620 it->second.output.clear(); out.close();
621 }
622 } //else not in the groups we are looking to parse, so ignore
623 }
624
625 count++;
626 }
627
628 //output rest
629 for (map<string, fastqOutput>::iterator it = params->parsedFastqData.begin(); it != params->parsedFastqData.end(); it++) {
630 if (params->m->getControl_pressed()) { break; }
631
632 if (it->second.output.size() != 0) { //more seqs to output
633 ofstream out; params->util.openOutputFileAppend(it->second.filename, out);
634 for (int j = 0; j < it->second.output.size(); j++) { it->second.output[j].printFastq(out); }
635 it->second.output.clear(); out.close();
636 params->outputNames.push_back(it->second.filename); params->outputTypes["fastq"].push_back(it->second.filename);
637 }else if (it->second.total == 0) { //no seqs for this group, remove file
638 params->util.mothurRemove(it->second.filename);
639 }else { //finished writing, just add to list of output files
640 params->outputNames.push_back(it->second.filename); params->outputTypes["fastq"].push_back(it->second.filename);
641 }
642 }
643
644 if (params->m->getControl_pressed()) { if (groupMap != NULL) { delete groupMap; }else if (ct != NULL) { delete ct; } return 0; }
645
646 return count;
647
648 }
649 catch(exception& e) {
650 params->m->errorOut(e, "SplitGroupCommand", "driverSplitFastq");
651 exit(1);
652 }
653 }
654 //**********************************************************************************************************************
splitFastqOrFlow(string inputFile,string extension)655 void SplitGroupCommand::splitFastqOrFlow(string inputFile, string extension){
656 try {
657 //create array of worker threads
658 vector<std::thread*> workerThreads;
659 vector<splitGroupsStruct*> data;
660
661 string outputfileRoot = outputdir + util.getRootName(util.getSimpleName(inputFile));
662
663 //Lauch worker threads
664 for (int i = 0; i < processors-1; i++) {
665 splitGroupsStruct* dataBundle = new splitGroupsStruct(groupfile, countfile, namefile, Groups, lines[i+1].start, lines[i+1].end);
666 dataBundle->setFiles(inputFile, outputfileRoot, extension);
667 dataBundle->setFormat(format);
668 data.push_back(dataBundle);
669
670 if (extension == ".fastq") {
671 workerThreads.push_back(new std::thread(driverSplitFastq, dataBundle));
672 }else {
673 workerThreads.push_back(new std::thread(driverSplitFlow, dataBundle));
674 }
675 }
676
677 splitGroupsStruct* dataBundle = new splitGroupsStruct(groupfile, countfile, namefile, Groups, lines[0].start, lines[0].end);
678 dataBundle->setFiles(inputFile, outputfileRoot, extension);
679 dataBundle->setFormat(format);
680
681 if (extension == ".fastq") { driverSplitFastq(dataBundle); }
682 else { driverSplitFlow(dataBundle); }
683
684 outputNames.insert(outputNames.end(), dataBundle->outputNames.begin(), dataBundle->outputNames.end());
685 for (itTypes = dataBundle->outputTypes.begin(); itTypes != dataBundle->outputTypes.end(); itTypes++) {
686 outputTypes[itTypes->first].insert(outputTypes[itTypes->first].end(), itTypes->second.begin(), itTypes->second.end());
687 }
688
689 for (int i = 0; i < processors-1; i++) {
690 workerThreads[i]->join();
691
692 outputNames.insert(outputNames.end(), data[i]->outputNames.begin(), data[i]->outputNames.end());
693 for (itTypes = data[i]->outputTypes.begin(); itTypes != data[i]->outputTypes.end(); itTypes++) {
694 outputTypes[itTypes->first].insert(outputTypes[itTypes->first].end(), itTypes->second.begin(), itTypes->second.end());
695 }
696
697 delete data[i];
698 delete workerThreads[i];
699 }
700
701 delete dataBundle;
702 }
703 catch(exception& e) {
704 m->errorOut(e, "SplitGroupCommand", "splitFastqOrFlow");
705 exit(1);
706 }
707 }
708 //**********************************************************************************************************************
709
710
711
712