1 /*
2 * getseqscommand.cpp
3 * Mothur
4 *
5 * Created by Sarah Westcott on 7/8/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7 *
8 */
9
10 #include "getseqscommand.h"
11 #include "sequence.hpp"
12 #include "listvector.hpp"
13 #include "counttable.h"
14 #include "fastqread.h"
15 #include "inputdata.h"
16 #include "contigsreport.hpp"
17 #include "alignreport.hpp"
18
19 //**********************************************************************************************************************
setParameters()20 vector<string> GetSeqsCommand::setParameters(){
21 try {
22 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "FNGLT", "none","fasta",false,false,true); parameters.push_back(pfasta);
23 CommandParameter pfastq("fastq", "InputTypes", "", "", "none", "FNGLT", "none","fastq",false,false,true); parameters.push_back(pfastq);
24 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "FNGLT", "none","name",false,false,true); parameters.push_back(pname);
25 CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "FNGLT", "none","count",false,false,true); parameters.push_back(pcount);
26 CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "FNGLT", "none","group",false,false,true); parameters.push_back(pgroup);
27 CommandParameter plist("list", "InputTypes", "", "", "none", "FNGLT", "none","list",false,false,true); parameters.push_back(plist);
28 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "FNGLT", "none","taxonomy",false,false,true); parameters.push_back(ptaxonomy);
29 CommandParameter palignreport("alignreport", "InputTypes", "", "", "none", "FNGLT", "none","alignreport",false,false); parameters.push_back(palignreport);
30 CommandParameter pcontigsreport("contigsreport", "InputTypes", "", "", "FNGLT", "FNGLT", "none","contigsreport",false,false); parameters.push_back(pcontigsreport);
31 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "FNGLT", "none","qfile",false,false); parameters.push_back(pqfile);
32 CommandParameter paccnos("accnos", "InputTypes", "", "", "none", "none", "none","",false,true,true); parameters.push_back(paccnos);
33 CommandParameter pdups("dups", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pdups);
34 CommandParameter pseed("seed", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pseed);
35 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
36 CommandParameter pformat("format", "Multiple", "sanger-illumina-solexa-illumina1.8+", "illumina1.8+", "", "", "","",false,false,true); parameters.push_back(pformat);
37 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
38 CommandParameter paccnos2("accnos2", "InputTypes", "", "", "none", "none", "none","",false,false); parameters.push_back(paccnos2);
39
40 abort = false; calledHelp = false;
41
42 vector<string> tempOutNames;
43 outputTypes["fasta"] = tempOutNames;
44 outputTypes["fastq"] = tempOutNames;
45 outputTypes["taxonomy"] = tempOutNames;
46 outputTypes["name"] = tempOutNames;
47 outputTypes["group"] = tempOutNames;
48 outputTypes["alignreport"] = tempOutNames;
49 outputTypes["contigsreport"] = tempOutNames;
50 outputTypes["list"] = tempOutNames;
51 outputTypes["qfile"] = tempOutNames;
52 outputTypes["count"] = tempOutNames;
53 outputTypes["accnosreport"] = tempOutNames;
54
55 vector<string> myArray;
56 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
57 return myArray;
58 }
59 catch(exception& e) {
60 m->errorOut(e, "GetSeqsCommand", "setParameters");
61 exit(1);
62 }
63 }
64 //**********************************************************************************************************************
getHelpString()65 string GetSeqsCommand::getHelpString(){
66 try {
67 string helpString = "";
68 helpString += "The get.seqs command reads an .accnos file and any of the following file types: fasta, name, group, count, list, taxonomy, quality, fastq, contigsreport or alignreport file.\n";
69 helpString += "It outputs a file containing only the sequences in the .accnos file.\n";
70 helpString += "The get.seqs command parameters are accnos, fasta, name, group, list, taxonomy, qfile, alignreport, contigsreport, fastq and dups. You must provide accnos unless you have a valid current accnos file, and at least one of the other parameters.\n";
71 helpString += "The dups parameter allows you to add the entire line from a name file if you add any name from the line. default=true. \n";
72 helpString += "The format parameter is used to indicate whether your sequences are sanger, solexa, illumina1.8+ or illumina, default=illumina1.8+.\n";
73 helpString += "The get.seqs command should be in the following format: get.seqs(accnos=yourAccnos, fasta=yourFasta).\n";
74 helpString += "Example get.seqs(accnos=amazon.accnos, fasta=amazon.fasta).\n";
75 ;
76 return helpString;
77 }
78 catch(exception& e) {
79 m->errorOut(e, "GetSeqsCommand", "getHelpString");
80 exit(1);
81 }
82 }
83 //**********************************************************************************************************************
getOutputPattern(string type)84 string GetSeqsCommand::getOutputPattern(string type) {
85 try {
86 string pattern = "";
87
88 if (type == "fasta") { pattern = "[filename],pick,[extension]"; }
89 else if (type == "fastq") { pattern = "[filename],pick,[extension]"; }
90 else if (type == "taxonomy") { pattern = "[filename],pick,[extension]"; }
91 else if (type == "name") { pattern = "[filename],pick,[extension]"; }
92 else if (type == "group") { pattern = "[filename],pick,[extension]"; }
93 else if (type == "count") { pattern = "[filename],pick,[extension]"; }
94 else if (type == "list") { pattern = "[filename],[distance],pick,[extension]"; }
95 else if (type == "qfile") { pattern = "[filename],pick,[extension]"; }
96 else if (type == "accnosreport") { pattern = "[filename],pick.accnos.report"; }
97 else if (type == "alignreport") { pattern = "[filename],pick.align.report"; }
98 else if (type == "contigsreport") { pattern = "[filename],pick.contigs.report"; }
99 else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->setControl_pressed(true); }
100
101 return pattern;
102 }
103 catch(exception& e) {
104 m->errorOut(e, "GetSeqsCommand", "getOutputPattern");
105 exit(1);
106 }
107 }
108 //**********************************************************************************************************************
109
GetSeqsCommand(set<string> n,string ffile,string lfile,string dupsFile,string dupsFileType,string output)110 GetSeqsCommand::GetSeqsCommand(set<string> n, string ffile, string lfile, string dupsFile, string dupsFileType, string output) : Command() {
111 try {
112 names = n;
113 dups = true;
114 outputdir = output;
115 fastafile = ffile;
116 listfile = lfile;
117
118 abort = false; calledHelp = false;
119 vector<string> tempOutNames;
120 outputTypes["name"] = tempOutNames;
121 outputTypes["count"] = tempOutNames;
122 outputTypes["fasta"] = tempOutNames;
123
124 if (dupsFile != "") {
125 if (dupsFileType == "count") {
126 countfile = dupsFile;
127 readCount();
128 }else { //names
129 namefile = dupsFile;
130 readName();
131 }
132 }
133
134 if (fastafile != "") { readFasta(); }
135 if (listfile != "") { readList(); }
136
137 }
138 catch(exception& e) {
139 m->errorOut(e, "GetSeqsCommand", "GetSeqsCommand - mothurRun");
140 exit(1);
141 }
142 }
143
144 //**********************************************************************************************************************
GetSeqsCommand(string option)145 GetSeqsCommand::GetSeqsCommand(string option) : Command() {
146 try {
147 //allow user to run help
148 if(option == "help") { help(); abort = true; calledHelp = true; }
149 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
150 else if(option == "category") { abort = true; calledHelp = true; }
151
152 else {
153 OptionParser parser(option, setParameters());
154 map<string,string> parameters = parser.getParameters();
155
156 ValidParameters validParameter;
157
158 //check for required parameters
159 accnosfile = validParameter.validFile(parameters, "accnos");
160 if (accnosfile == "not open") { abort = true; }
161 else if (accnosfile == "not found") {
162 accnosfile = current->getAccnosFile();
163 if (accnosfile != "") { m->mothurOut("Using " + accnosfile + " as input file for the accnos parameter.\n"); }
164 else {
165 m->mothurOut("You have no valid accnos file and accnos is required.\n"); abort = true;
166 }
167 }else { current->setAccnosFile(accnosfile); }
168
169 if (accnosfile2 == "not found") { accnosfile2 = ""; }
170
171 fastafile = validParameter.validFile(parameters, "fasta");
172 if (fastafile == "not open") { fastafile = ""; abort = true; }
173 else if (fastafile == "not found") { fastafile = ""; }
174 else { current->setFastaFile(fastafile); }
175
176 namefile = validParameter.validFile(parameters, "name");
177 if (namefile == "not open") { namefile = ""; abort = true; }
178 else if (namefile == "not found") { namefile = ""; }
179 else { current->setNameFile(namefile); }
180
181 groupfile = validParameter.validFile(parameters, "group");
182 if (groupfile == "not open") { abort = true; }
183 else if (groupfile == "not found") { groupfile = ""; }
184 else { current->setGroupFile(groupfile); }
185
186 alignfile = validParameter.validFile(parameters, "alignreport");
187 if (alignfile == "not open") { abort = true; }
188 else if (alignfile == "not found") { alignfile = ""; }
189
190 contigsreportfile = validParameter.validFile(parameters, "contigsreport");
191 if (contigsreportfile == "not open") { abort = true; }
192 else if (contigsreportfile == "not found") { contigsreportfile = ""; }
193 else { current->setContigsReportFile(contigsreportfile); }
194
195 listfile = validParameter.validFile(parameters, "list");
196 if (listfile == "not open") { abort = true; }
197 else if (listfile == "not found") { listfile = ""; }
198 else { current->setListFile(listfile); }
199
200 taxfile = validParameter.validFile(parameters, "taxonomy");
201 if (taxfile == "not open") { taxfile = ""; abort = true; }
202 else if (taxfile == "not found") { taxfile = ""; }
203 else { current->setTaxonomyFile(taxfile); }
204
205 qualfile = validParameter.validFile(parameters, "qfile");
206 if (qualfile == "not open") { abort = true; }
207 else if (qualfile == "not found") { qualfile = ""; }
208 else { current->setQualFile(qualfile); }
209
210 fastqfile = validParameter.validFile(parameters, "fastq");
211 if (fastqfile == "not open") { abort = true; }
212 else if (fastqfile == "not found") { fastqfile = ""; }
213
214 accnosfile2 = validParameter.validFile(parameters, "accnos2");
215 if (accnosfile2 == "not open") { abort = true; }
216 else if (accnosfile2 == "not found") { accnosfile2 = ""; }
217
218 countfile = validParameter.validFile(parameters, "count");
219 if (countfile == "not open") { countfile = ""; abort = true; }
220 else if (countfile == "not found") { countfile = ""; }
221 else { current->setCountFile(countfile); }
222
223 if ((namefile != "") && (countfile != "")) {
224 m->mothurOut("[ERROR]: you may only use one of the following: name or count.\n"); abort = true;
225 }
226
227 if ((groupfile != "") && (countfile != "")) {
228 m->mothurOut("[ERROR]: you may only use one of the following: group or count.\n"); abort=true;
229 }
230
231
232 string usedDups = "true";
233 string temp = validParameter.valid(parameters, "dups"); if (temp == "not found") { temp = "true"; usedDups = ""; }
234 dups = util.isTrue(temp);
235
236 format = validParameter.valid(parameters, "format"); if (format == "not found"){ format = "illumina1.8+"; }
237
238 if ((format != "sanger") && (format != "illumina") && (format != "illumina1.8+") && (format != "solexa")) {
239 m->mothurOut(format + " is not a valid format. Your format choices are sanger, solexa, illumina1.8+ and illumina, aborting.\n" );
240 abort=true;
241 }
242
243 if ((fastqfile == "") && (fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "") && (qualfile == "") && (accnosfile2 == "") && (countfile == "") && (contigsreportfile == "")) { m->mothurOut("You must provide one of the following: fasta, name, group, count, alignreport, contigsreport, taxonomy, quality, fastq or listfile.\n"); abort = true; }
244
245 //read accnos file
246 if (!abort) { names = util.readAccnos(accnosfile); }
247 }
248 }
249 catch(exception& e) {
250 m->errorOut(e, "GetSeqsCommand", "GetSeqsCommand");
251 exit(1);
252 }
253 }
254 //**********************************************************************************************************************
255
execute()256 int GetSeqsCommand::execute(){
257 try {
258
259 if (abort) { if (calledHelp) { return 0; } return 2; }
260
261 //read through the correct file and output lines you want to keep
262 if (namefile != "") { readName(); }
263 if (fastafile != "") { readFasta(); }
264 if (fastqfile != "") { readFastq(); }
265 if (groupfile != "") { readGroup(); }
266 if (countfile != "") { readCount(); }
267 if (alignfile != "") { readAlign(); }
268 if (contigsreportfile != "") { readContigs(); }
269 if (listfile != "") { readList(); }
270 if (taxfile != "") { readTax(); }
271 if (qualfile != "") { readQual(); }
272 if (accnosfile2 != "") { compareAccnos(); }
273
274 if (m->getDebug()) { runSanityCheck(); }
275
276 if (m->getControl_pressed()) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { util.mothurRemove(outputNames[i]); } return 0; }
277
278
279 if (outputNames.size() != 0) {
280 m->mothurOut("\nOutput File Names:\n");
281 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]+"\n"); }
282 m->mothurOutEndLine();
283
284 //set fasta file as new current fastafile
285 string currentName = "";
286 itTypes = outputTypes.find("fasta");
287 if (itTypes != outputTypes.end()) {
288 if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setFastaFile(currentName); }
289 }
290
291 itTypes = outputTypes.find("name");
292 if (itTypes != outputTypes.end()) {
293 if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setNameFile(currentName); }
294 }
295
296 itTypes = outputTypes.find("group");
297 if (itTypes != outputTypes.end()) {
298 if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setGroupFile(currentName); }
299 }
300
301 itTypes = outputTypes.find("list");
302 if (itTypes != outputTypes.end()) {
303 if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setListFile(currentName); }
304 }
305
306 itTypes = outputTypes.find("taxonomy");
307 if (itTypes != outputTypes.end()) {
308 if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setTaxonomyFile(currentName); }
309 }
310
311 itTypes = outputTypes.find("qfile");
312 if (itTypes != outputTypes.end()) {
313 if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setQualFile(currentName); }
314 }
315
316 itTypes = outputTypes.find("count");
317 if (itTypes != outputTypes.end()) {
318 if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setCountFile(currentName); }
319 }
320 }
321
322 return 0;
323 }
324
325 catch(exception& e) {
326 m->errorOut(e, "GetSeqsCommand", "execute");
327 exit(1);
328 }
329 }
330 //**********************************************************************************************************************
readFastq()331 void GetSeqsCommand::readFastq(){
332 try {
333 bool wroteSomething = false;
334 int selectedCount = 0;
335
336 ifstream in; util.openInputFile(fastqfile, in);
337
338 string thisOutputDir = outputdir;
339 if (outputdir == "") { thisOutputDir += util.hasPath(fastqfile); }
340 map<string, string> variables;
341 variables["[filename]"] = thisOutputDir + util.getRootName(util.getSimpleName(fastqfile));
342 variables["[extension]"] = util.getExtension(fastqfile);
343 string outputFileName = getOutputFileName("fastq", variables);
344 ofstream out;
345 util.openOutputFile(outputFileName, out);
346
347 set<string> uniqueNames;
348 while(!in.eof()){
349
350 if (m->getControl_pressed()) { in.close(); out.close(); util.mothurRemove(outputFileName); return; }
351
352 //read sequence name
353 bool ignore;
354 FastqRead fread(in, ignore, format); util.gobble(in);
355
356 if (!ignore) {
357 string name = fread.getName();
358
359 if (names.count(name) != 0) {
360 if (uniqueNames.count(name) == 0) { //this name hasn't been seen yet
361 wroteSomething = true;
362 selectedCount++;
363 fread.printFastq(out);
364 uniqueNames.insert(name);
365 }else {
366 m->mothurOut("[WARNING]: " + name + " is in your fastq file more than once. Mothur requires sequence names to be unique. I will only add it once.\n");
367 }
368 }
369 }
370
371 util.gobble(in);
372 }
373 in.close();
374 out.close();
375
376 if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file.\n"); }
377 outputNames.push_back(outputFileName); outputTypes["fastq"].push_back(outputFileName);
378
379 m->mothurOut("Selected " + toString(selectedCount) + " sequences from your fastq file.\n");
380
381 return;
382 }
383 catch(exception& e) {
384 m->errorOut(e, "GetSeqsCommand", "readFastq");
385 exit(1);
386 }
387 }
388 //**********************************************************************************************************************
readFasta()389 void GetSeqsCommand::readFasta(){
390 try {
391 string thisOutputDir = outputdir;
392 if (outputdir == "") { thisOutputDir += util.hasPath(fastafile); }
393 map<string, string> variables;
394 variables["[filename]"] = thisOutputDir + util.getRootName(util.getSimpleName(fastafile));
395 variables["[extension]"] = util.getExtension(fastafile);
396 string outputFileName = getOutputFileName("fasta", variables);
397 ofstream out; util.openOutputFile(outputFileName, out);
398
399 ifstream in; util.openInputFile(fastafile, in);
400 string name;
401
402 bool wroteSomething = false;
403 int selectedCount = 0;
404
405 if (m->getDebug()) { set<string> temp; sanity["fasta"] = temp; }
406
407 set<string> uniqueNames; int line = 0; int redundNum = 0;
408 while(!in.eof()){
409
410 if (m->getControl_pressed()) { in.close(); out.close(); util.mothurRemove(outputFileName); return; }
411
412 Sequence currSeq(in);
413 name = currSeq.getName();
414
415 if (!dups) {//adjust name if needed
416 map<string, string>::iterator it = uniqueMap.find(name);
417 if (it != uniqueMap.end()) { currSeq.setName(it->second); }
418 }
419
420 name = currSeq.getName();
421
422 if (name != "") {
423 line++;
424 //if this name is in the accnos file
425 if (names.count(name) != 0) {
426 if (uniqueNames.count(name) == 0) { //this name hasn't been seen yet
427 wroteSomething = true;
428
429 currSeq.printSequence(out);
430 selectedCount++;
431 uniqueNames.insert(name);
432
433 if (m->getDebug()) { sanity["fasta"].insert(name); }
434 }else {
435 m->mothurOut("[WARNING]: " + name + " is in your fasta file more than once. Mothur requires sequence names to be unique. I will only add it once.\n");
436 redundNum++;
437 }
438 }
439 }
440 util.gobble(in);
441 }
442 in.close();
443 out.close();
444
445 if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file.\n"); }
446 outputNames.push_back(outputFileName); outputTypes["fasta"].push_back(outputFileName);
447
448 m->mothurOut("Selected " + toString(selectedCount) + " sequences from your fasta file.\n");
449
450 return;
451 }
452 catch(exception& e) {
453 m->errorOut(e, "GetSeqsCommand", "readFasta");
454 exit(1);
455 }
456 }
457 //**********************************************************************************************************************
readQual()458 void GetSeqsCommand::readQual(){
459 try {
460 string thisOutputDir = outputdir;
461 if (outputdir == "") { thisOutputDir += util.hasPath(qualfile); }
462 map<string, string> variables;
463 variables["[filename]"] = thisOutputDir + util.getRootName(util.getSimpleName(qualfile));
464 variables["[extension]"] = util.getExtension(qualfile);
465 string outputFileName = getOutputFileName("qfile", variables);
466 ofstream out;
467 util.openOutputFile(outputFileName, out);
468
469 ifstream in;
470 util.openInputFile(qualfile, in);
471 string name;
472
473 bool wroteSomething = false;
474 int selectedCount = 0;
475
476 if (m->getDebug()) { set<string> temp; sanity["qual"] = temp; }
477
478 set<string> uniqueNames;
479 while(!in.eof()){
480
481 QualityScores qual(in); util.gobble(in);
482
483 if (!dups) {//adjust name if needed
484 map<string, string>::iterator it = uniqueMap.find(qual.getName());
485 if (it != uniqueMap.end()) { qual.setName(it->second); }
486 }
487
488 string name = qual.getName();
489
490 if (names.count(name) != 0) {
491 if (uniqueNames.count(name) == 0) { //this name hasn't been seen yet
492 uniqueNames.insert(name);
493 wroteSomething = true;
494
495 qual.printQScores(out);
496 selectedCount++;
497 if (m->getDebug()) { sanity["qual"].insert(name); }
498 }else {
499 m->mothurOut("[WARNING]: " + name + " is in your qfile more than once. Mothur requires sequence names to be unique. I will only add it once.\n");
500 }
501 }
502
503 util.gobble(in);
504 }
505 in.close();
506 out.close();
507
508 if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file.\n"); }
509 outputNames.push_back(outputFileName); outputTypes["qfile"].push_back(outputFileName);
510
511 m->mothurOut("Selected " + toString(selectedCount) + " sequences from your quality file.\n");
512
513 return;
514 }
515 catch(exception& e) {
516 m->errorOut(e, "GetSeqsCommand", "readQual");
517 exit(1);
518 }
519 }
520 //**********************************************************************************************************************
readCount()521 void GetSeqsCommand::readCount(){
522 try {
523 string thisOutputDir = outputdir;
524 if (outputdir == "") { thisOutputDir += util.hasPath(countfile); }
525 map<string, string> variables;
526 variables["[filename]"] = thisOutputDir + util.getRootName(util.getSimpleName(countfile));
527 variables["[extension]"] = util.getExtension(countfile);
528 string outputFileName = getOutputFileName("count", variables);
529
530 CountTable ct; ct.readTable(countfile, true, false, names);
531
532 bool wroteSomething = false;
533 int selectedCount = ct.getNumSeqs();
534 if (selectedCount != 0) { wroteSomething = true; }
535
536 ct.printTable(outputFileName);
537
538 if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file.\n"); }
539 outputTypes["count"].push_back(outputFileName); outputNames.push_back(outputFileName);
540
541 m->mothurOut("Selected " + toString(selectedCount) + " sequences from your count file.\n");
542
543 return;
544 }
545 catch(exception& e) {
546 m->errorOut(e, "GetSeqsCommand", "readCount");
547 exit(1);
548 }
549 }
550 //**********************************************************************************************************************
readList()551 void GetSeqsCommand::readList(){
552 try {
553 string thisOutputDir = outputdir;
554 if (outputdir == "") { thisOutputDir += util.hasPath(listfile); }
555 map<string, string> variables;
556 variables["[filename]"] = thisOutputDir + util.getRootName(util.getSimpleName(listfile));
557 variables["[extension]"] = util.getExtension(listfile);
558 InputData input(listfile, "list", nullVector);
559 ListVector* list = input.getListVector();
560
561 bool wroteSomething = false;
562 int selectedCount = 0;
563
564 if (m->getDebug()) { set<string> temp; sanity["list"] = temp; }
565
566 while(list != NULL) {
567
568 set<string> uniqueNames;
569 selectedCount = 0;
570
571 //make a new list vector
572 ListVector newList;
573 newList.setLabel(list->getLabel());
574
575 variables["[distance]"] = list->getLabel();
576 string outputFileName = getOutputFileName("list", variables);
577
578 ofstream out;
579 util.openOutputFile(outputFileName, out);
580 outputTypes["list"].push_back(outputFileName); outputNames.push_back(outputFileName);
581
582 vector<string> binLabels = list->getLabels();
583 vector<string> newBinLabels;
584
585 if (m->getControl_pressed()) { out.close(); return; }
586
587 //for each bin
588 for (int i = 0; i < list->getNumBins(); i++) {
589
590 //parse out names that are in accnos file
591 string binnames = list->get(i);
592 vector<string> bnames;
593 util.splitAtComma(binnames, bnames);
594
595 string newNames = "";
596 for (int j = 0; j < bnames.size(); j++) {
597 string name = bnames[j];
598
599 //if that name is in the .accnos file, add it
600 if (names.count(name) != 0) {
601 if (uniqueNames.count(name) == 0) { //this name hasn't been seen yet
602 uniqueNames.insert(name);
603 newNames += name + ",";
604 selectedCount++;
605 if (m->getDebug()) { sanity["list"].insert(name); }
606 }else {
607 m->mothurOut("[WARNING]: " + name + " is in your list file more than once. Mothur requires sequence names to be unique. I will only add it once.\n");
608 }
609 }
610 }
611
612 //if there are names in this bin add to new list
613 if (newNames != "") {
614 newNames = newNames.substr(0, newNames.length()-1); //rip off extra comma
615 newList.push_back(newNames);
616 newBinLabels.push_back(binLabels[i]);
617 }
618 }
619
620 //print new listvector
621 if (newList.getNumBins() != 0) {
622 wroteSomething = true;
623 newList.setLabels(newBinLabels);
624 newList.print(out, false);
625 }
626
627
628 out.close();
629
630 delete list;
631 list = input.getListVector();
632 }
633
634
635 if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file.\n"); }
636
637 m->mothurOut("Selected " + toString(selectedCount) + " sequences from your list file.\n");
638
639 return;
640 }
641 catch(exception& e) {
642 m->errorOut(e, "GetSeqsCommand", "readList");
643 exit(1);
644 }
645 }
646 //**********************************************************************************************************************
readName()647 void GetSeqsCommand::readName(){
648 try {
649 string thisOutputDir = outputdir;
650 if (outputdir == "") { thisOutputDir += util.hasPath(namefile); }
651 map<string, string> variables;
652 variables["[filename]"] = thisOutputDir + util.getRootName(util.getSimpleName(namefile));
653 variables["[extension]"] = util.getExtension(namefile);
654 string outputFileName = getOutputFileName("name", variables);
655 ofstream out; util.openOutputFile(outputFileName, out);
656
657 ifstream in; util.openInputFile(namefile, in);
658 string name, firstCol, secondCol;
659
660 bool wroteSomething = false;
661 int selectedCount = 0;
662
663 if (m->getDebug()) { set<string> temp; sanity["name"] = temp; }
664 if (m->getDebug()) { set<string> temp; sanity["dupname"] = temp; }
665
666 set<string> uniqueNames;
667 while(!in.eof()){
668
669 if (m->getControl_pressed()) { in.close(); out.close(); util.mothurRemove(outputFileName); return; }
670
671 in >> firstCol; util.gobble(in);
672 in >> secondCol; util.gobble(in);
673
674 string hold = "";
675 if (dups) { hold = secondCol; }
676
677 vector<string> parsedNames;
678 util.splitAtComma(secondCol, parsedNames);
679
680 vector<string> validSecond; vector<string> parsedNames2;
681 bool parsedError = false;
682 for (int i = 0; i < parsedNames.size(); i++) {
683 if (names.count(parsedNames[i]) != 0) {
684 if (uniqueNames.count(parsedNames[i]) == 0) { //this name hasn't been seen yet
685 uniqueNames.insert(parsedNames[i]);
686 validSecond.push_back(parsedNames[i]);
687 parsedNames2.push_back(parsedNames[i]);
688 if (m->getDebug()) { sanity["dupname"].insert(parsedNames[i]); }
689 }else {
690 m->mothurOut("[WARNING]: " + parsedNames[i] + " is in your name file more than once. Mothur requires sequence names to be unique. I will only add it once.\n");
691 parsedError = true;
692 }
693 }
694 }
695 if (parsedError) {
696 parsedNames = parsedNames2;
697 hold = "";
698 if (parsedNames.size() != 0) {
699 for (int i = 0; i < parsedNames.size()-1; i++) { hold += parsedNames[i] + ','; }
700 hold += parsedNames[parsedNames.size()-1] + '\n';
701 }
702 }
703
704 if (dups && (validSecond.size() != 0)) { //dups = true and we want to add someone, then add everyone
705 for (int i = 0; i < parsedNames.size(); i++) { names.insert(parsedNames[i]); if (m->getDebug()) { sanity["dupname"].insert(parsedNames[i]); } }
706 out << firstCol << '\t' << hold << endl;
707 wroteSomething = true;
708 selectedCount += parsedNames.size();
709 if (m->getDebug()) { sanity["name"].insert(firstCol); }
710 }else {
711
712 if (validSecond.size() != 0) {
713 selectedCount += validSecond.size();
714
715 //if the name in the first column is in the set then print it and any other names in second column also in set
716 if (names.count(firstCol) != 0) {
717
718 wroteSomething = true;
719
720 out << firstCol << '\t';
721
722 //you know you have at least one valid second since first column is valid
723 for (int i = 0; i < validSecond.size()-1; i++) { out << validSecond[i] << ','; }
724 out << validSecond[validSecond.size()-1] << endl;
725
726 if (m->getDebug()) { sanity["name"].insert(firstCol); }
727
728
729 //make first name in set you come to first column and then add the remaining names to second column
730 }else {
731
732 //you want part of this row
733 if (validSecond.size() != 0) {
734
735 wroteSomething = true;
736
737 out << validSecond[0] << '\t';
738 //we are changing the unique name in the fasta file
739 uniqueMap[firstCol] = validSecond[0];
740
741 //you know you have at least one valid second since first column is valid
742 for (int i = 0; i < validSecond.size()-1; i++) { out << validSecond[i] << ','; }
743 out << validSecond[validSecond.size()-1] << endl;
744
745 if (m->getDebug()) { sanity["name"].insert(validSecond[0]); }
746 }
747 }
748 }
749 }
750 }
751 in.close(); out.close();
752
753 if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file.\n"); }
754 outputNames.push_back(outputFileName); outputTypes["name"].push_back(outputFileName);
755
756 m->mothurOut("Selected " + toString(selectedCount) + " sequences from your name file.\n");
757
758 return;
759 }
760 catch(exception& e) {
761 m->errorOut(e, "GetSeqsCommand", "readName");
762 exit(1);
763 }
764 }
765 //**********************************************************************************************************************
readGroup()766 void GetSeqsCommand::readGroup(){
767 try {
768 string thisOutputDir = outputdir;
769 if (outputdir == "") { thisOutputDir += util.hasPath(groupfile); }
770 map<string, string> variables;
771 variables["[filename]"] = thisOutputDir + util.getRootName(util.getSimpleName(groupfile));
772 variables["[extension]"] = util.getExtension(groupfile);
773 string outputFileName = getOutputFileName("group", variables);
774 ofstream out;
775 util.openOutputFile(outputFileName, out);
776
777
778 ifstream in;
779 util.openInputFile(groupfile, in);
780 string name, group;
781
782 bool wroteSomething = false;
783 int selectedCount = 0;
784
785 if (m->getDebug()) { set<string> temp; sanity["group"] = temp; }
786
787 set<string> uniqueNames;
788 while(!in.eof()){
789
790 if (m->getControl_pressed()) { in.close(); out.close(); util.mothurRemove(outputFileName); return; }
791
792
793 in >> name; util.gobble(in); //read from first column
794 in >> group; //read from second column
795
796
797 if (names.count(name) != 0) {
798 if (uniqueNames.count(name) == 0) { //this name hasn't been seen yet
799 uniqueNames.insert(name);
800 wroteSomething = true;
801
802 out << name << '\t' << group << endl;
803 selectedCount++;
804
805 if (m->getDebug()) { sanity["group"].insert(name); }
806 }else {
807 m->mothurOut("[WARNING]: " + name + " is in your group file more than once. Mothur requires sequence names to be unique. I will only add it once.\n");
808 }
809 }
810
811 util.gobble(in);
812 }
813 in.close();
814 out.close();
815
816 if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file.\n"); }
817 outputNames.push_back(outputFileName); outputTypes["group"].push_back(outputFileName);
818
819 m->mothurOut("Selected " + toString(selectedCount) + " sequences from your group file.\n");
820
821 return;
822 }
823 catch(exception& e) {
824 m->errorOut(e, "GetSeqsCommand", "readGroup");
825 exit(1);
826 }
827 }
828 //**********************************************************************************************************************
readTax()829 void GetSeqsCommand::readTax(){
830 try {
831 string thisOutputDir = outputdir;
832 if (outputdir == "") { thisOutputDir += util.hasPath(taxfile); }
833 map<string, string> variables;
834 variables["[filename]"] = thisOutputDir + util.getRootName(util.getSimpleName(taxfile));
835 variables["[extension]"] = util.getExtension(taxfile);
836 string outputFileName = getOutputFileName("taxonomy", variables);
837 ofstream out;
838 util.openOutputFile(outputFileName, out);
839
840 ifstream in;
841 util.openInputFile(taxfile, in);
842 string name, tax;
843
844 bool wroteSomething = false;
845 int selectedCount = 0;
846
847 if (m->getDebug()) { set<string> temp; sanity["tax"] = temp; }
848
849 set<string> uniqueNames;
850 while(!in.eof()){
851
852 if (m->getControl_pressed()) { in.close(); out.close(); util.mothurRemove(outputFileName); return; }
853
854 in >> name; util.gobble(in);
855 tax = util.getline(in); util.gobble(in);
856
857 if (!dups) {//adjust name if needed
858 map<string, string>::iterator it = uniqueMap.find(name);
859 if (it != uniqueMap.end()) { name = it->second; }
860 }
861
862 if (names.count(name) != 0) {
863 if (uniqueNames.count(name) == 0) { //this name hasn't been seen yet
864 uniqueNames.insert(name);
865
866 wroteSomething = true;
867
868 out << name << '\t' << tax << endl;
869 selectedCount++;
870
871 if (m->getDebug()) { sanity["tax"].insert(name); }
872 }else {
873 m->mothurOut("[WARNING]: " + name + " is in your taxonomy file more than once. Mothur requires sequence names to be unique. I will only add it once.\n");
874 }
875 }
876 }
877 in.close();
878 out.close();
879
880 if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file.\n"); }
881 outputNames.push_back(outputFileName); outputTypes["taxonomy"].push_back(outputFileName);
882
883 m->mothurOut("Selected " + toString(selectedCount) + " sequences from your taxonomy file.\n");
884
885 return;
886
887 }
888 catch(exception& e) {
889 m->errorOut(e, "GetSeqsCommand", "readTax");
890 exit(1);
891 }
892 }
893 //**********************************************************************************************************************
894 //alignreport file has a column header line then all other lines contain 16 columns. we just want the first column since that contains the name
readAlign()895 void GetSeqsCommand::readAlign(){
896 try {
897 string thisOutputDir = outputdir;
898 if (outputdir == "") { thisOutputDir += util.hasPath(alignfile); }
899 map<string, string> variables;
900 variables["[filename]"] = thisOutputDir + util.getRootName(util.getSimpleName(alignfile));
901 string outputFileName = getOutputFileName("alignreport", variables);
902
903 ofstream out; util.openOutputFile(outputFileName, out);
904 ifstream in; util.openInputFile(alignfile, in);
905
906 AlignReport report;
907 report.readHeaders(in); util.gobble(in);
908 report.printHeaders(out);
909
910 bool wroteSomething = false; int selectedCount = 0;
911
912 set<string> uniqueNames;
913 while(!in.eof()){
914
915 if (m->getControl_pressed()) { in.close(); out.close(); util.mothurRemove(outputFileName); return; }
916
917 report.read(in); util.gobble(in);
918 string name = report.getQueryName();
919
920 if (!dups) {//adjust name if needed
921 map<string, string>::iterator it = uniqueMap.find(name);
922 if (it != uniqueMap.end()) { name = it->second; }
923 }
924
925 //if this name is in the accnos file
926 if (names.count(name) != 0) {
927 if (uniqueNames.count(name) == 0) { //this name hasn't been seen yet
928 uniqueNames.insert(name);
929 wroteSomething = true;
930 selectedCount++;
931
932 report.print(out);
933 }else {
934 m->mothurOut("[WARNING]: " + name + " is in your alignreport file more than once. Mothur requires sequence names to be unique. I will only add it once.\n");
935 }
936 }
937 }
938 in.close(); out.close();
939
940 if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file.\n"); }
941 outputNames.push_back(outputFileName); outputTypes["alignreport"].push_back(outputFileName);
942
943 m->mothurOut("Selected " + toString(selectedCount) + " sequences from your alignreport file.\n");
944
945 return;
946 }
947 catch(exception& e) {
948 m->errorOut(e, "GetSeqsCommand", "readAlign");
949 exit(1);
950 }
951 }
952 //**********************************************************************************************************************
953 //contigsreport file has a column header line then all other lines contain 8 columns. we just want the first column since that contains the name
readContigs()954 void GetSeqsCommand::readContigs(){
955 try {
956 string thisOutputDir = outputdir;
957 if (outputdir == "") { thisOutputDir += util.hasPath(contigsreportfile); }
958 map<string, string> variables;
959 variables["[filename]"] = thisOutputDir + util.getRootName(util.getSimpleName(contigsreportfile));
960 string outputFileName = getOutputFileName("contigsreport", variables);
961 ofstream out; util.openOutputFile(outputFileName, out);
962
963 bool wroteSomething = false;
964 int selectedCount = 0;
965
966 set<string> uniqueNames;
967 ifstream in; util.openInputFile(contigsreportfile, in);
968
969 ContigsReport report;
970 report.readHeaders(in); util.gobble(in);
971 report.printHeaders(out);
972
973 while(!in.eof()){
974
975 if (m->getControl_pressed()) { break; }
976
977 report.read(in); util.gobble(in);
978 string name = report.getName();
979
980 if (!dups) {//adjust name if needed
981 map<string, string>::iterator it = uniqueMap.find(name);
982 if (it != uniqueMap.end()) { name = it->second; }
983 }
984
985 if (names.count(name) != 0) {
986 if (uniqueNames.count(name) == 0) { //this name hasn't been seen yet
987 uniqueNames.insert(name);
988 wroteSomething = true; selectedCount++;
989
990 report.print(out);
991 }else {
992 m->mothurOut("[WARNING]: " + name + " is in your contigsreport file more than once. Mothur requires sequence names to be unique. I will only add it once.\n");
993 }
994 }
995 }
996 in.close();
997 out.close();
998
999 if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file.\n"); ofstream out1; util.openOutputFile(outputFileName, out1); out1.close(); } //reopening file clears header line
1000 outputNames.push_back(outputFileName); outputTypes["contigsreport"].push_back(outputFileName);
1001
1002 m->mothurOut("Selected " + toString(selectedCount) + " sequences from your contigsreport file.\n");
1003
1004 return;
1005 }
1006 catch(exception& e) {
1007 m->errorOut(e, "ListSeqsCommand", "readContigs");
1008 exit(1);
1009 }
1010 }
1011 //**********************************************************************************************************************
1012 //just looking at common mistakes.
runSanityCheck()1013 int GetSeqsCommand::runSanityCheck(){
1014 try {
1015 string thisOutputDir = outputdir;
1016 if (outputdir == "") { thisOutputDir += util.hasPath(fastafile); }
1017 string filename = outputdir + "get.seqs.debug.report";
1018
1019 ofstream out;
1020 util.openOutputFile(filename, out);
1021
1022
1023 //compare fasta, name, qual and taxonomy if given to make sure they contain the same seqs
1024 if (fastafile != "") {
1025 if (namefile != "") { //compare with fasta
1026 if (sanity["fasta"] != sanity["name"]) { //create mismatch file
1027 createMisMatchFile(out, fastafile, namefile, sanity["fasta"], sanity["name"]);
1028 }
1029 }
1030 if (qualfile != "") {
1031 if (sanity["fasta"] != sanity["qual"]) { //create mismatch file
1032 createMisMatchFile(out, fastafile, qualfile, sanity["fasta"], sanity["qual"]);
1033 }
1034 }
1035 if (taxfile != "") {
1036 if (sanity["fasta"] != sanity["tax"]) { //create mismatch file
1037 createMisMatchFile(out, fastafile, taxfile, sanity["fasta"], sanity["tax"]);
1038 }
1039 }
1040 }
1041
1042 //compare dupnames, groups and list if given to make sure they match
1043 if (namefile != "") {
1044 if (groupfile != "") {
1045 if (sanity["dupname"] != sanity["group"]) { //create mismatch file
1046 createMisMatchFile(out, namefile, groupfile, sanity["dupname"], sanity["group"]);
1047 }
1048 }
1049 if (listfile != "") {
1050 if (sanity["dupname"] != sanity["list"]) { //create mismatch file
1051 createMisMatchFile(out, namefile, listfile, sanity["dupname"], sanity["list"]);
1052 }
1053 }
1054 }else{
1055
1056 if ((groupfile != "") && (fastafile != "")) {
1057 if (sanity["fasta"] != sanity["group"]) { //create mismatch file
1058 createMisMatchFile(out, fastafile, groupfile, sanity["fasta"], sanity["group"]);
1059 }
1060 }
1061 }
1062
1063 out.close();
1064
1065 if (util.isBlank(filename)) { util.mothurRemove(filename); }
1066 else { m->mothurOut("\n[DEBUG]: " + filename + " contains the file mismatches.\n");outputNames.push_back(filename); outputTypes["debug"].push_back(filename); }
1067
1068 return 0;
1069 }
1070 catch(exception& e) {
1071 m->errorOut(e, "GetSeqsCommand", "runSanityCheck");
1072 exit(1);
1073 }
1074 }
1075 //**********************************************************************************************************************
1076 //just looking at common mistakes.
createMisMatchFile(ofstream & out,string filename1,string filename2,set<string> set1,set<string> set2)1077 int GetSeqsCommand::createMisMatchFile(ofstream& out, string filename1, string filename2, set<string> set1, set<string> set2){
1078 try {
1079 out << "****************************************" << endl << endl;
1080 out << "Names unique to " << filename1 << ":\n";
1081
1082 //remove names in set1 that are also in set2
1083 for (set<string>::iterator it = set1.begin(); it != set1.end();) {
1084 string name = *it;
1085
1086 if (set2.count(name) == 0) { out << name << endl; } //name unique to set1
1087 else { set2.erase(name); } //you are in both so erase
1088 set1.erase(it++);
1089 }
1090
1091 out << "\nNames unique to " << filename2 << ":\n";
1092 //output results
1093 for (set<string>::iterator it = set2.begin(); it != set2.end(); it++) { out << *it << endl; }
1094
1095 out << "****************************************" << endl << endl;
1096
1097 return 0;
1098 }
1099 catch(exception& e) {
1100 m->errorOut(e, "GetSeqsCommand", "runSanityCheck");
1101 exit(1);
1102 }
1103 }
1104 //**********************************************************************************************************************
1105
compareAccnos()1106 int GetSeqsCommand::compareAccnos(){
1107 try {
1108
1109 string thisOutputDir = outputdir;
1110 if (outputdir == "") { thisOutputDir += util.hasPath(accnosfile); }
1111 map<string, string> variables;
1112 variables["[filename]"] = thisOutputDir + util.getRootName(util.getSimpleName(accnosfile));
1113 string outputFileName = getOutputFileName("accnosreport", variables);
1114
1115 ofstream out;
1116 util.openOutputFile(outputFileName, out);
1117
1118 ifstream in;
1119 util.openInputFile(accnosfile2, in);
1120 string name;
1121
1122 set<string> namesAccnos2;
1123 set<string> namesDups;
1124 set<string> namesAccnos = names;
1125
1126 map<string, int> nameCount;
1127
1128 if (namefile != "") {
1129 ifstream inName;
1130 util.openInputFile(namefile, inName);
1131
1132
1133 while(!inName.eof()){
1134
1135 if (m->getControl_pressed()) { inName.close(); return 0; }
1136
1137 string thisname, repnames;
1138
1139 inName >> thisname; util.gobble(inName); //read from first column
1140 inName >> repnames; //read from second column
1141
1142 int num = util.getNumNames(repnames);
1143 nameCount[thisname] = num;
1144
1145 util.gobble(inName);
1146 }
1147 inName.close();
1148 }
1149
1150 while(!in.eof()){
1151 in >> name;
1152
1153 if (namesAccnos.count(name) == 0){ //name unique to accnos2
1154 int pos = name.find_last_of('_');
1155 string tempName = name;
1156 if (pos != string::npos) { tempName = tempName.substr(pos+1); }
1157 if (namesAccnos.count(tempName) == 0){
1158 namesAccnos2.insert(name);
1159 }else { //you are in both so erase
1160 namesAccnos.erase(name);
1161 namesDups.insert(name);
1162 }
1163 }else { //you are in both so erase
1164 namesAccnos.erase(name);
1165 namesDups.insert(name);
1166 }
1167
1168 util.gobble(in);
1169 }
1170 in.close();
1171
1172 out << "Names in both files : " + toString(namesDups.size()) << endl;
1173 m->mothurOut("Names in both files : " + toString(namesDups.size())); m->mothurOutEndLine();
1174
1175 for (set<string>::iterator it = namesDups.begin(); it != namesDups.end(); it++) {
1176 out << (*it);
1177 if (namefile != "") { out << '\t' << nameCount[(*it)]; }
1178 out << endl;
1179 }
1180
1181 out << "Names unique to " + accnosfile + " : " + toString(namesAccnos.size()) << endl;
1182 m->mothurOut("Names unique to " + accnosfile + " : " + toString(namesAccnos.size())); m->mothurOutEndLine();
1183
1184 for (set<string>::iterator it = namesAccnos.begin(); it != namesAccnos.end(); it++) {
1185 out << (*it);
1186 if (namefile != "") { out << '\t' << nameCount[(*it)]; }
1187 out << endl;
1188 }
1189
1190 out << "Names unique to " + accnosfile2 + " : " + toString(namesAccnos2.size()) << endl;
1191 m->mothurOut("Names unique to " + accnosfile2 + " : " + toString(namesAccnos2.size())); m->mothurOutEndLine();
1192
1193 for (set<string>::iterator it = namesAccnos2.begin(); it != namesAccnos2.end(); it++) {
1194 out << (*it);
1195 if (namefile != "") { out << '\t' << nameCount[(*it)]; }
1196 out << endl;
1197 }
1198
1199 out.close();
1200
1201 outputNames.push_back(outputFileName); outputTypes["accnosreport"].push_back(outputFileName);
1202
1203 return 0;
1204
1205 }
1206 catch(exception& e) {
1207 m->errorOut(e, "GetSeqsCommand", "compareAccnos");
1208 exit(1);
1209 }
1210 }
1211
1212
1213 //**********************************************************************************************************************
1214
1215