1 /*
2 * chimeraperseuscommand.cpp
3 * Mothur
4 *
5 * Created by westcott on 10/26/11.
6 * Copyright 2011 Schloss Lab. All rights reserved.
7 *
8 */
9
10 #include "chimeraperseuscommand.h"
11 #include "deconvolutecommand.h"
12 #include "sequence.hpp"
13 #include "counttable.h"
14 #include "sequencecountparser.h"
15 //**********************************************************************************************************************
setParameters()16 vector<string> ChimeraPerseusCommand::setParameters(){
17 try {
18 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","chimera-accnos",false,true,true); parameters.push_back(pfasta);
19 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "NameCount", "none","",false,false,true); parameters.push_back(pname);
20 CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "NameCount", "none","",false,false,true); parameters.push_back(pcount);
21 CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","",false,false,true); parameters.push_back(pgroup);
22 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
23 CommandParameter pdups("dereplicate", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pdups);
24
25 CommandParameter pseed("seed", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pseed);
26 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
27 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
28 CommandParameter pcutoff("cutoff", "Number", "", "0.5", "", "", "","",false,false); parameters.push_back(pcutoff);
29 CommandParameter palpha("alpha", "Number", "", "-5.54", "", "", "","",false,false); parameters.push_back(palpha);
30 CommandParameter pbeta("beta", "Number", "", "0.33", "", "", "","",false,false); parameters.push_back(pbeta);
31
32 abort = false; calledHelp = false;
33
34 vector<string> tempOutNames;
35 outputTypes["chimera"] = tempOutNames;
36 outputTypes["accnos"] = tempOutNames;
37 outputTypes["count"] = tempOutNames;
38
39 vector<string> myArray;
40 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
41 return myArray;
42 }
43 catch(exception& e) {
44 m->errorOut(e, "ChimeraPerseusCommand", "setParameters");
45 exit(1);
46 }
47 }
48 //**********************************************************************************************************************
getHelpString()49 string ChimeraPerseusCommand::getHelpString(){
50 try {
51 string helpString = "";
52 helpString += "The chimera.perseus command reads a fastafile and namefile or countfile and outputs potentially chimeric sequences.\n";
53 helpString += "The chimera.perseus command parameters are fasta, name, group, cutoff, processors, dereplicate, alpha and beta.\n";
54 helpString += "The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required, unless you have a valid current fasta file. \n";
55 helpString += "The name parameter allows you to provide a name file associated with your fasta file.\n";
56 helpString += "The count parameter allows you to provide a count file associated with your fasta file. A count or name file is required. When you use a count file with group info and dereplicate=T, mothur will create a *.pick.count_table file containing seqeunces after chimeras are removed.\n";
57 helpString += "The group parameter allows you to provide a group file. When checking sequences, only sequences from the same group as the query sequence will be used as the reference. \n";
58 helpString += "The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n";
59 helpString += "If the dereplicate parameter is false, then if one group finds the seqeunce to be chimeric, then all groups find it to be chimeric, default=f.\n";
60 helpString += "The alpha parameter .... The default is -5.54. \n";
61 helpString += "The beta parameter .... The default is 0.33. \n";
62 helpString += "The cutoff parameter .... The default is 0.50. \n";
63 helpString += "The chimera.perseus command should be in the following format: \n";
64 helpString += "chimera.perseus(fasta=yourFastaFile, name=yourNameFile) \n";
65 helpString += "Example: chimera.perseus(fasta=AD.align, name=AD.names) \n";
66
67 return helpString;
68 }
69 catch(exception& e) {
70 m->errorOut(e, "ChimeraPerseusCommand", "getHelpString");
71 exit(1);
72 }
73 }
74 //**********************************************************************************************************************
getOutputPattern(string type)75 string ChimeraPerseusCommand::getOutputPattern(string type) {
76 try {
77 string pattern = "";
78
79 if (type == "chimera") { pattern = "[filename],perseus.chimeras"; }
80 else if (type == "accnos") { pattern = "[filename],perseus.accnos"; }
81 else if (type == "count") { pattern = "[filename],perseus.pick.count_table"; }
82 else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->setControl_pressed(true); }
83
84 return pattern;
85 }
86 catch(exception& e) {
87 m->errorOut(e, "ChimeraPerseusCommand", "getOutputPattern");
88 exit(1);
89 }
90 }
91 //***************************************************************************************************************
ChimeraPerseusCommand(string option)92 ChimeraPerseusCommand::ChimeraPerseusCommand(string option) : Command() {
93 try {
94 hasCount = false;
95
96 //allow user to run help
97 if(option == "help") { help(); abort = true; calledHelp = true; }
98 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
99 else if(option == "category") { abort = true; calledHelp = true; }
100
101 else {
102 OptionParser parser(option, setParameters());
103 map<string,string> parameters = parser.getParameters();
104
105 //check for required parameters
106 ValidParameters validParameter;
107 fastafile = validParameter.validFile(parameters, "fasta");
108 if (fastafile == "not found") {
109 fastafile = current->getFastaFile();
110 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter.\n"); }
111 else { m->mothurOut("[ERROR]: You have no current fasta file and the fasta parameter is required.\n"); abort = true; }
112 }
113 else if (fastafile == "not open") { abort = true; }
114 else { current->setFastaFile(fastafile); }
115
116 bool hasName = false;
117 string namefile = validParameter.validFile(parameters, "name");
118 if (namefile == "not open") { namefile = ""; abort = true; }
119 else if (namefile == "not found") { namefile = ""; }
120 else { current->setNameFile(namefile); }
121 if (namefile != "") { hasName = true; }
122
123 countfile = validParameter.validFile(parameters, "count");
124 if (countfile == "not open") { countfile = ""; abort = true; }
125 else if (countfile == "not found") { countfile = ""; }
126 else { current->setCountFile(countfile); }
127 if (countfile != "") { hasCount = true; }
128
129 //make sure there is at least one valid file left
130 if (hasName && hasCount) { m->mothurOut("[ERROR]: You must enter ONLY ONE of the following: count or name.\n"); abort = true; }
131
132 if (!hasName && !hasCount) {
133 //if there is a current name file, use it, else look for current count file
134 string filename = current->getNameFile();
135 if (filename != "") { hasName = true; namefile = filename; m->mothurOut("Using " + filename + " as input file for the name parameter.\n"); }
136 else {
137 filename = current->getCountFile();
138 if (filename != "") { hasCount = true; countfile = filename; m->mothurOut("Using " + filename + " as input file for the count parameter.\n"); }
139 else { m->mothurOut("[ERROR]: You must provide a count or name file.\n"); abort = true; }
140 }
141 }
142
143 bool hasGroup = false;
144 string groupfile = validParameter.validFile(parameters, "group");
145 if (groupfile == "not open") { abort = true; }
146 else if (groupfile == "not found") { groupfile = ""; }
147 else { current->setGroupFile(groupfile); hasGroup = true; }
148
149 if (hasGroup && hasCount) { m->mothurOut("[ERROR]: You must enter ONLY ONE of the following: count or group.\n"); abort = true; }
150
151 string temp = validParameter.valid(parameters, "processors"); if (temp == "not found"){ temp = current->getProcessors(); }
152 processors = current->setProcessors(temp);
153
154 temp = validParameter.valid(parameters, "cutoff"); if (temp == "not found"){ temp = "0.50"; }
155 util.mothurConvert(temp, cutoff);
156
157 temp = validParameter.valid(parameters, "alpha"); if (temp == "not found"){ temp = "-5.54"; }
158 util.mothurConvert(temp, alpha);
159
160 temp = validParameter.valid(parameters, "beta"); if (temp == "not found"){ temp = "0.33"; }
161 util.mothurConvert(temp, beta);
162
163 temp = validParameter.valid(parameters, "dereplicate");
164 if (temp == "not found") { temp = "false"; }
165 dups = util.isTrue(temp);
166
167 if (!abort) {
168 if ((namefile != "") || (groupfile != "")) { //convert to count
169
170 string rootFileName = namefile;
171 if (rootFileName == "") { rootFileName = groupfile; }
172
173 if (outputdir == "") { outputdir = util.hasPath(rootFileName); }
174 string outputFileName = outputdir + util.getRootName(util.getSimpleName(rootFileName)) + "count_table";
175
176 CountTable ct; ct.createTable(namefile, groupfile, nullVector); ct.printCompressedTable(outputFileName);
177 outputNames.push_back(outputFileName);
178
179 current->setCountFile(outputFileName);
180 countfile = outputFileName;
181 hasCount = true;
182 }
183 }
184 }
185 }
186 catch(exception& e) {
187 m->errorOut(e, "ChimeraPerseusCommand", "ChimeraPerseusCommand");
188 exit(1);
189 }
190 }
191 /**************************************************************************************************/
192 struct perseusData {
193 Utils util;
194 MothurOut* m;
195 vector<seqData> sequences;
196 string group;
197 int count, numChimeras;
198 string chimeraFileName;
199 string accnosFileName;
200 double alpha, beta, cutoff;
201
perseusDataperseusData202 perseusData(string cf, string ac, double a, double b, double c){
203 m = MothurOut::getInstance();
204 count = 0;
205 numChimeras = 0;
206 accnosFileName = ac;
207 chimeraFileName = cf;
208 alpha = a;
209 beta = b;
210 cutoff = c;
211 }
212 };
213 //**********************************************************************************************************************
214 //void driver(string chimeraFileName, vector<seqData>& sequences, string accnosFileName, int& numChimeras){
driver(perseusData * params)215 void driver(perseusData* params){
216 try {
217 vector<vector<double> > correctModel(4); //could be an option in the future to input own model matrix
218 for(int i=0;i<4;i++){ correctModel[i].resize(4); }
219
220 correctModel[0][0] = 0.000000; //AA
221 correctModel[1][0] = 11.619259; //CA
222 correctModel[2][0] = 11.694004; //TA
223 correctModel[3][0] = 7.748623; //GA
224
225 correctModel[1][1] = 0.000000; //CC
226 correctModel[2][1] = 7.619657; //TC
227 correctModel[3][1] = 12.852562; //GC
228
229 correctModel[2][2] = 0.000000; //TT
230 correctModel[3][2] = 10.964048; //TG
231
232 correctModel[3][3] = 0.000000; //GG
233
234 for(int i=0;i<4;i++){ for(int j=0;j<i;j++){ correctModel[j][i] = correctModel[i][j]; } }
235
236 int numSeqs = params->sequences.size();
237 int alignLength = params->sequences[0].sequence.size();
238
239 ofstream chimeraFile;
240 ofstream accnosFile;
241 params->util.openOutputFile(params->chimeraFileName, chimeraFile);
242 params->util.openOutputFile(params->accnosFileName, accnosFile);
243
244 Perseus myPerseus;
245 vector<vector<double> > binMatrix = myPerseus.binomial(alignLength);
246
247 chimeraFile << "SequenceIndex\tName\tDiffsToBestMatch\tBestMatchIndex\tBestMatchName\tDiffstToChimera\tIndexofLeftParent\tIndexOfRightParent\tNameOfLeftParent\tNameOfRightParent\tDistanceToBestMatch\tcIndex\t(cIndex - singleDist)\tloonIndex\tMismatchesToChimera\tMismatchToTrimera\tChimeraBreakPoint\tLogisticProbability\tTypeOfSequence\n";
248
249 vector<bool> chimeras(numSeqs, 0);
250
251 for(int i=0;i<numSeqs;i++){
252 if (params->m->getControl_pressed()) { chimeraFile.close(); accnosFile.close(); break; }
253
254 vector<bool> restricted = chimeras;
255
256 vector<vector<int> > leftDiffs(numSeqs);
257 vector<vector<int> > leftMaps(numSeqs);
258 vector<vector<int> > rightDiffs(numSeqs);
259 vector<vector<int> > rightMaps(numSeqs);
260
261 vector<int> singleLeft, bestLeft;
262 vector<int> singleRight, bestRight;
263
264 int bestSingleIndex, bestSingleDiff;
265 vector<pwAlign> alignments(numSeqs);
266
267 int comparisons = myPerseus.getAlignments(i, params->sequences, alignments, leftDiffs, leftMaps, rightDiffs, rightMaps, bestSingleIndex, bestSingleDiff, restricted);
268 if (params->m->getControl_pressed()) { chimeraFile.close(); accnosFile.close(); break; }
269
270 int minMismatchToChimera, leftParentBi, rightParentBi, breakPointBi;
271
272 string dummyA, dummyB;
273
274 if (params->sequences[i].sequence.size() < 3) {
275 chimeraFile << i << '\t' << params->sequences[i].seqName << "\t0\t0\tNull\t0\t0\t0\tNull\tNull\t0.0\t0.0\t0.0\t0\t0\t0\t0.0\t0.0\tgood" << endl;
276 }else if(comparisons >= 2){
277 minMismatchToChimera = myPerseus.getChimera(params->sequences, leftDiffs, rightDiffs, leftParentBi, rightParentBi, breakPointBi, singleLeft, bestLeft, singleRight, bestRight, restricted);
278 if (params->m->getControl_pressed()) { chimeraFile.close(); accnosFile.close(); break; }
279
280 int minMismatchToTrimera = numeric_limits<int>::max();
281 int leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB;
282
283 if(minMismatchToChimera >= 3 && comparisons >= 3){
284 minMismatchToTrimera = myPerseus.getTrimera(params->sequences, leftDiffs, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, singleLeft, bestLeft, singleRight, bestRight, restricted);
285 if (params->m->getControl_pressed()) { chimeraFile.close(); accnosFile.close(); break; }
286 }
287
288 double singleDist = myPerseus.modeledPairwiseAlignSeqs(params->sequences[i].sequence, params->sequences[bestSingleIndex].sequence, dummyA, dummyB, correctModel);
289
290 if (params->m->getControl_pressed()) { chimeraFile.close(); accnosFile.close(); break; }
291
292 string type;
293 string chimeraRefSeq;
294
295 if(minMismatchToChimera - minMismatchToTrimera >= 3){
296 type = "trimera";
297 chimeraRefSeq = myPerseus.stitchTrimera(alignments, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, leftMaps, rightMaps);
298 }
299 else{
300 type = "chimera";
301 chimeraRefSeq = myPerseus.stitchBimera(alignments, leftParentBi, rightParentBi, breakPointBi, leftMaps, rightMaps);
302 }
303
304 if (params->m->getControl_pressed()) { chimeraFile.close(); accnosFile.close(); break; }
305
306 double chimeraDist = myPerseus.modeledPairwiseAlignSeqs(params->sequences[i].sequence, chimeraRefSeq, dummyA, dummyB, correctModel);
307
308 if (params->m->getControl_pressed()) { chimeraFile.close(); accnosFile.close(); break; }
309
310 double cIndex = chimeraDist;//modeledPairwiseAlignSeqs(sequences[i].sequence, chimeraRefSeq);
311 double loonIndex = myPerseus.calcLoonIndex(params->sequences[i].sequence, params->sequences[leftParentBi].sequence, params->sequences[rightParentBi].sequence, breakPointBi, binMatrix);
312
313 if (params->m->getControl_pressed()) { chimeraFile.close(); accnosFile.close(); break; }
314
315 chimeraFile << i << '\t' << params->sequences[i].seqName << '\t' << bestSingleDiff << '\t' << bestSingleIndex << '\t' << params->sequences[bestSingleIndex].seqName << '\t';
316 chimeraFile << minMismatchToChimera << '\t' << leftParentBi << '\t' << rightParentBi << '\t' << params->sequences[leftParentBi].seqName << '\t' << params->sequences[rightParentBi].seqName << '\t';
317 chimeraFile << singleDist << '\t' << cIndex << '\t' << (cIndex - singleDist) << '\t' << loonIndex << '\t';
318 chimeraFile << minMismatchToChimera << '\t' << minMismatchToTrimera << '\t' << breakPointBi << '\t';
319
320 double probability = myPerseus.classifyChimera(singleDist, cIndex, loonIndex, params->alpha, params->beta);
321
322 chimeraFile << probability << '\t';
323
324 if(probability > params->cutoff){
325 chimeraFile << type << endl;
326 accnosFile << params->sequences[i].seqName << endl;
327 chimeras[i] = 1;
328 params->numChimeras++;
329 }
330 else{ chimeraFile << "good" << endl; }
331 }
332 else{
333 chimeraFile << i << '\t' << params->sequences[i].seqName << "\t0\t0\tNull\t0\t0\t0\tNull\tNull\t0.0\t0.0\t0.0\t0\t0\t0\t0.0\t0.0\tgood" << endl;
334 }
335
336 //report progress
337 if((i+1) % 100 == 0){ params->m->mothurOutJustToScreen("Processing sequence: " + toString(i+1) + "\n"); }
338 params->count++; //# of sequences completed. Used by calling function to check for failure
339 }
340
341 if((numSeqs) % 100 != 0){ params->m->mothurOutJustToScreen("Processing sequence: " + toString(numSeqs) + "\n"); }
342
343 if (!params->m->getControl_pressed()) { chimeraFile.close(); accnosFile.close(); }
344 }
345 catch(exception& e) {
346 params->m->errorOut(e, "ChimeraPerseusCommand", "driver");
347 exit(1);
348 }
349 }
350 //***************************************************************************************************************
351
execute()352 int ChimeraPerseusCommand::execute(){
353 try{
354 if (abort) { if (calledHelp) { return 0; } return 2; }
355
356 m->mothurOut("Checking sequences from " + fastafile + " ...\n" );
357
358 long start = time(NULL);
359 if (outputdir == "") { outputdir = util.hasPath(fastafile); }
360 map<string, string> variables;
361 variables["[filename]"] = outputdir + util.getRootName(util.getSimpleName(fastafile));
362 string outputFileName = getOutputFileName("chimera", variables);
363 string accnosFileName = getOutputFileName("accnos", variables);
364 string newCountFile = "";
365
366 if (countfile == "") { countfile = getCountFile(fastafile); hasCount=true; }
367
368 if (m->getControl_pressed()) { return 0; }
369
370 int numSeqs = 0; int numChimeras = 0;
371
372 if (hasCount) {
373 CountTable ct;
374 vector<string> groups;
375 if (ct.testGroups(countfile, groups)) { //fills groups if count file has them
376
377 variables["[filename]"] = outputdir + util.getRootName(util.getSimpleName(countfile));
378 newCountFile = getOutputFileName("count", variables);
379
380 vector<string> groups;
381 map<string, vector<string> > group2Files;
382 if (hasCount) {
383 current->setMothurCalling(true);
384 SequenceCountParser cparser(countfile, fastafile, nullVector);
385 current->setMothurCalling(false);
386 groups = cparser.getNamesOfGroups();
387 group2Files = cparser.getFiles();
388
389 }
390
391 if (m->getControl_pressed()) { return 0; }
392
393 //clears files
394 ofstream out, out1, out2;
395 util.openOutputFile(outputFileName, out); out.close();
396 util.openOutputFile(accnosFileName, out1); out1.close();
397
398 string countlist = accnosFileName+".byCount";
399 numSeqs = createProcessesGroups(group2Files, outputFileName, countlist, accnosFileName, newCountFile, groups, fastafile, countfile, numChimeras);
400
401 if (m->getControl_pressed()) { for (int j = 0; j < outputNames.size(); j++) { util.mothurRemove(outputNames[j]); } return 0; }
402
403 if (!dups) {
404 numChimeras = deconvoluteResults(outputFileName, accnosFileName);
405 }else {
406 CountTable newCount; newCount.readTable(countfile, true, false);
407
408 if (!util.isBlank(countlist)) {
409 ifstream in2;
410 util.openInputFile(countlist, in2);
411
412 string name, group;
413 while (!in2.eof()) {
414 in2 >> name; util.gobble(in2); in2 >> group; util.gobble(in2);
415 newCount.setAbund(name, group, 0);
416 }
417 in2.close();
418 }
419 util.mothurRemove(countlist);
420
421 //print new *.pick.count_table
422 vector<string> namesInTable = newCount.printTable(newCountFile); //returns non zeroed names
423 outputNames.push_back(newCountFile); outputTypes["count"].push_back(newCountFile);
424
425 set<string> doNotRemove = util.mothurConvert(namesInTable);
426
427 //remove names we want to keep from accnos file.
428 set<string> accnosNames = util.readAccnos(accnosFileName);
429 ofstream out2; util.openOutputFile(accnosFileName, out2);
430 for (set<string>::iterator it = accnosNames.begin(); it != accnosNames.end(); it++) { if (doNotRemove.count(*it) == 0) { out2 << (*it) << endl; } }
431 out2.close();
432 }
433
434 util.mothurRemove(countlist);
435 m->mothurOut("The number of sequences checked may be larger than the number of unique sequences because some sequences are found in several samples.\n");
436
437 if (m->getControl_pressed()) { for (int j = 0; j < outputNames.size(); j++) { util.mothurRemove(outputNames[j]); } return 0; }
438
439 }else {
440 if (processors != 1) { m->mothurOut("Your count file does not contain group information, mothur can only use 1 processor, continuing.\n"); processors = 1; }
441
442 //read sequences and store sorted by frequency
443 ct.readTable(countfile, false, false);
444 vector<seqData> sequences = readFiles(fastafile, ct.getNameMap());
445
446 if (m->getControl_pressed()) { for (int j = 0; j < outputNames.size(); j++) { util.mothurRemove(outputNames[j]); } return 0; }
447
448 perseusData* dataBundle = new perseusData(outputFileName, accnosFileName, alpha, beta, cutoff);
449 dataBundle->sequences = sequences;
450 driver(dataBundle);
451 numSeqs = dataBundle->count; numChimeras = dataBundle->numChimeras;
452 delete dataBundle;
453 }
454 }
455
456 if (m->getControl_pressed()) { for (int j = 0; j < outputNames.size(); j++) { util.mothurRemove(outputNames[j]); } return 0; }
457
458 m->mothurOut("\nIt took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences. " + toString(numChimeras) + " chimeras were found.\n");
459 outputNames.push_back(outputFileName); outputTypes["chimera"].push_back(outputFileName);
460 outputNames.push_back(accnosFileName); outputTypes["accnos"].push_back(accnosFileName);
461
462
463 //set accnos file as new current accnosfile
464 string currentName = "";
465 itTypes = outputTypes.find("accnos");
466 if (itTypes != outputTypes.end()) {
467 if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setAccnosFile(currentName); }
468 }
469
470 itTypes = outputTypes.find("count");
471 if (itTypes != outputTypes.end()) {
472 if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setCountFile(currentName); }
473 }
474
475 m->mothurOut("\nOutput File Names: \n");
476 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
477 m->mothurOutEndLine();
478
479 return 0;
480
481 }
482 catch(exception& e) {
483 m->errorOut(e, "ChimeraPerseusCommand", "execute");
484 exit(1);
485 }
486 }
487 //**********************************************************************************************************************
getCountFile(string & inputFile)488 string ChimeraPerseusCommand::getCountFile(string& inputFile){
489 try {
490 string countFile = "";
491
492 m->mothurOut("\nNo count file given, running unique.seqs command to generate one.\n\n");
493
494 //use unique.seqs to create new name and fastafile
495 string inputString = "format=count, fasta=" + inputFile;
496 m->mothurOut("/******************************************/\n");
497 m->mothurOut("Running command: unique.seqs(" + inputString + ")\n");
498 current->setMothurCalling(true);
499
500 Command* uniqueCommand = new DeconvoluteCommand(inputString);
501 uniqueCommand->execute();
502
503 map<string, vector<string> > filenames = uniqueCommand->getOutputFiles();
504
505 delete uniqueCommand;
506 current->setMothurCalling(false);
507 m->mothurOut("/******************************************/\n");
508
509 countFile = filenames["count"][0];
510 inputFile = filenames["fasta"][0];
511
512 return countFile;
513 }
514 catch(exception& e) {
515 m->errorOut(e, "ChimeraPerseusCommand", "getNamesFile");
516 exit(1);
517 }
518 }
519 /**************************************************************************************************/
520 struct perseusGroupsData {
521 string fastafile;
522 string dupsfile;
523 string chimeraFileName;
524 string accnosFileName;
525 string countlist;
526 map<string, vector<string> > parsedFiles;
527
528 bool hasCount, dups;
529 int threadID, count, numChimeras;
530 double alpha, beta, cutoff;
531 vector<string> groups;
532 Utils util;
533 MothurOut* m;
534
perseusGroupsDataperseusGroupsData535 perseusGroupsData(){}
perseusGroupsDataperseusGroupsData536 perseusGroupsData(map<string, vector<string> >& g2f,bool dps, bool hc, double a, double b, double c, string o, string f, string n, string ac, string ctlist, vector<string> gr, int tid) {
537 alpha = a;
538 beta = b;
539 cutoff = c;
540 fastafile = f;
541 dupsfile = n;
542 chimeraFileName = o;
543 countlist = ctlist;
544 accnosFileName = ac;
545 m = MothurOut::getInstance();
546 threadID = tid;
547 groups = gr;
548 hasCount = hc;
549 dups = dps;
550 count = 0;
551 numChimeras = 0;
552 parsedFiles = g2f;
553 }
554 };
555 //**********************************************************************************************************************
loadSequences(map<string,int> & nameMap,string thisGroupsFastaFile,perseusGroupsData * params)556 vector<seqData> loadSequences(map<string, int>& nameMap, string thisGroupsFastaFile, perseusGroupsData* params){
557 try {
558 bool error = false;
559 vector<seqData> sequences;
560
561 ifstream in;
562 params->util.openInputFile(thisGroupsFastaFile, in);
563
564 vector<seqPriorityNode> nameVector;
565 map<string, int>::iterator itNameMap;
566 while (!in.eof()) {
567 if (params->m->getControl_pressed()) { break; }
568
569 Sequence seq(in); params->util.gobble(in);
570
571 itNameMap = nameMap.find(seq.getName());
572
573 if (itNameMap == nameMap.end()){
574 error = true;
575 params->m->mothurOut("[ERROR]: " + seq.getName() + " is in your fastafile, but is not in your name or count file, please correct.\n");
576 }else {
577 int num = itNameMap->second;
578
579 seq.setAligned(params->util.removeNs(seq.getUnaligned()));
580 sequences.push_back(seqData(seq.getName(), seq.getUnaligned(), num));
581 }
582 }
583 in.close();
584
585 if (error) { params->m->setControl_pressed(true); }
586
587 //sort by frequency
588 sort(sequences.rbegin(), sequences.rend());
589
590 return sequences;
591 }
592 catch(exception& e) {
593 params->m->errorOut(e, "ChimeraPerseusCommand", "loadSequences");
594 exit(1);
595 }
596 }
597
598 //**********************************************************************************************************************
599 //string outputFName, string accnos, string countlist, int start, int end, vector<string> groups
driverGroups(perseusGroupsData * params)600 void driverGroups(perseusGroupsData* params){
601 try {
602 //clears files
603 ofstream out, out1, out2;
604 params->util.openOutputFile(params->chimeraFileName, out); out.close();
605 params->util.openOutputFile(params->accnosFileName, out1); out1.close();
606
607 int totalSeqs = 0;
608 ofstream outCountList;
609 if (params->hasCount && params->dups) { params->util.openOutputFile(params->countlist, outCountList); }
610
611 for (map<string, vector<string> >::iterator it = params->parsedFiles.begin(); it != params->parsedFiles.end(); it++) {
612 long start = time(NULL); if (params->m->getControl_pressed()) { break; }
613
614
615 string thisGroup = it->first;
616
617 map<string, int> nameMap;
618 if (params->hasCount) {
619 CountTable ct; ct.readTable(it->second[1], false, true);
620 nameMap = ct.getNameMap();
621 }
622
623 params->m->mothurOut("\nChecking sequences from group " + thisGroup + "...\n");
624
625 perseusData* driverParams = new perseusData((params->chimeraFileName+thisGroup), (params->accnosFileName+thisGroup), params->alpha, params->beta, params->cutoff);
626 driverParams->sequences = loadSequences(nameMap, it->second[0], params);
627
628 if (params->m->getControl_pressed()) { break; }
629
630 driver(driverParams);
631 totalSeqs += driverParams->count;
632 params->numChimeras += driverParams->numChimeras;
633
634 if (params->m->getControl_pressed()) { break; }
635
636 if (params->dups) {
637 if (!params->util.isBlank(driverParams->accnosFileName)) {
638 ifstream in;
639 params->util.openInputFile(driverParams->accnosFileName, in);
640 string name;
641 if (params->hasCount) {
642 while (!in.eof()) {
643 in >> name; params->util.gobble(in);
644 outCountList << name << '\t' << thisGroup << endl;
645 }
646 in.close();
647 }
648 }
649 }
650
651 //append files
652 params->util.appendFiles(driverParams->chimeraFileName, params->chimeraFileName); params->util.mothurRemove(driverParams->chimeraFileName);
653 params->util.appendFiles(driverParams->accnosFileName, params->accnosFileName); params->util.mothurRemove(driverParams->accnosFileName);
654
655 params->m->mothurOut("\nIt took " + toString(time(NULL) - start) + " secs to check " + toString(driverParams->count) + " sequences from group " + thisGroup + ".\n");
656 delete driverParams;
657 }
658
659 if (params->hasCount && params->dups) { outCountList.close(); }
660
661 params->count = totalSeqs;
662
663 }
664 catch(exception& e) {
665 params->m->errorOut(e, "ChimeraPerseusCommand", "driverGroups");
666 exit(1);
667 }
668 }
669 //**********************************************************************************************************************
readFiles(string inputFile,map<string,int> nameMap)670 vector<seqData> ChimeraPerseusCommand::readFiles(string inputFile, map<string, int> nameMap){
671 try {
672 map<string, int>::iterator it;
673
674 //read fasta file and create sequenceData structure - checking for file mismatches
675 vector<seqData> sequences;
676 bool error = false;
677 ifstream in;
678 util.openInputFile(inputFile, in);
679 alignLength = 0;
680
681 while (!in.eof()) {
682
683 if (m->getControl_pressed()) { in.close(); return sequences; }
684
685 Sequence temp(in); util.gobble(in);
686
687 it = nameMap.find(temp.getName());
688 if (it == nameMap.end()) { error = true; m->mothurOut("[ERROR]: " + temp.getName() + " is in your fasta file and not in your namefile, please correct.\n"); }
689 else {
690 temp.setAligned(util.removeNs(temp.getUnaligned()));
691 sequences.push_back(seqData(temp.getName(), temp.getUnaligned(), it->second));
692 if (temp.getUnaligned().length() > alignLength) { alignLength = temp.getUnaligned().length(); }
693 }
694 }
695 in.close();
696
697 if (error) { m->setControl_pressed(true); }
698
699 //sort by frequency
700 sort(sequences.rbegin(), sequences.rend());
701
702 return sequences;
703 }
704 catch(exception& e) {
705 m->errorOut(e, "ChimeraPerseusCommand", "readFiles");
706 exit(1);
707 }
708 }
709 /**************************************************************************************************/
710 //perseusData(vector<seqData>& s, double a, double b, double c, string o, string ac, MothurOut* mout)
711 //numSeqs = createProcessesGroups(outputFileName, countlist, accnosFileName, newCountFile, groups, fastafile, countfile, numChimeras);
createProcessesGroups(map<string,vector<string>> & parsedFiles,string outputFName,string countlisttemp,string accnos,string newCountFile,vector<string> groups,string fasta,string dupsFile,int & numChimeras)712 int ChimeraPerseusCommand::createProcessesGroups(map<string, vector<string> >& parsedFiles, string outputFName, string countlisttemp, string accnos, string newCountFile, vector<string> groups, string fasta, string dupsFile, int& numChimeras) {
713 try {
714 numChimeras = 0;
715
716 //sanity check
717 if (groups.size() < processors) { processors = groups.size(); m->mothurOut("Reducing processors to " + toString(groups.size()) + ".\n"); }
718
719 //divide the groups between the processors
720 vector<linePair> lines;
721 int remainingPairs = groups.size();
722 int startIndex = 0;
723 for (int remainingProcessors = processors; remainingProcessors > 0; remainingProcessors--) {
724 int numPairs = remainingPairs; //case for last processor
725 if (remainingProcessors != 1) { numPairs = ceil(remainingPairs / remainingProcessors); }
726 lines.push_back(linePair(startIndex, (startIndex+numPairs))); //startIndex, endIndex
727 startIndex = startIndex + numPairs;
728 remainingPairs = remainingPairs - numPairs;
729 }
730
731 //create array of worker threads
732 vector<std::thread*> workerThreads;
733 vector<perseusGroupsData*> data;
734
735 long long num = 0;
736 time_t start, end;
737 time(&start);
738
739 //Lauch worker threads
740 for (int i = 0; i < processors-1; i++) {
741 string extension = toString(i+1) + ".temp";
742 vector<string> thisGroups;
743 map<string, vector<string> > thisGroupsParsedFiles;
744 for (int j = lines[i+1].start; j < lines[i+1].end; j++) {
745
746 map<string, vector<string> >::iterator it = parsedFiles.find(groups[j]);
747 if (it != parsedFiles.end()) {
748 thisGroupsParsedFiles[groups[j]] = (it->second);
749 thisGroups.push_back(groups[j]);
750 }
751 else { m->mothurOut("[ERROR]: missing files for group " + groups[j] + ", skipping\n"); }
752 }
753 perseusGroupsData* dataBundle = new perseusGroupsData(thisGroupsParsedFiles, dups, hasCount, alpha, beta, cutoff, (outputFName+extension), fasta, dupsFile, (accnos+extension), (countlisttemp+extension), thisGroups, (i+1));
754 data.push_back(dataBundle);
755
756 workerThreads.push_back(new std::thread(driverGroups, dataBundle));
757 }
758
759 vector<string> thisGroups;
760 map<string, vector<string> > thisGroupsParsedFiles;
761 for (int j = lines[0].start; j < lines[0].end; j++) {
762
763 map<string, vector<string> >::iterator it = parsedFiles.find(groups[j]);
764 if (it != parsedFiles.end()) {
765 thisGroupsParsedFiles[groups[j]] = (it->second);
766 thisGroups.push_back(groups[j]);
767 }
768 else { m->mothurOut("[ERROR]: missing files for group " + groups[j] + ", skipping\n"); }
769 }
770 perseusGroupsData* dataBundle = new perseusGroupsData(thisGroupsParsedFiles, dups, hasCount, alpha, beta, cutoff, outputFName, fasta, dupsFile, accnos, countlisttemp, thisGroups, 0);
771 driverGroups(dataBundle);
772 num = dataBundle->count;
773 numChimeras = dataBundle->numChimeras;
774
775 for (int i = 0; i < processors-1; i++) {
776 workerThreads[i]->join();
777 num += data[i]->count;
778 numChimeras += data[i]->numChimeras;
779
780 string extension = toString(i+1) + ".temp";
781 util.appendFiles((outputFName+extension), outputFName);
782 util.mothurRemove((outputFName+extension));
783
784 util.appendFiles((accnos+extension), accnos);
785 util.mothurRemove((accnos+extension));
786
787 util.appendFiles((countlisttemp+extension), countlisttemp);
788 util.mothurRemove((countlisttemp+extension));
789
790 delete data[i];
791 delete workerThreads[i];
792 }
793 delete dataBundle;
794
795 time(&end);
796 m->mothurOut("It took " + toString(difftime(end, start)) + " secs to check " + toString(num) + " sequences.\n\n");
797
798 return num;
799
800 }
801 catch(exception& e) {
802 m->errorOut(e, "ChimeraPerseusCommand", "createProcessesGroups");
803 exit(1);
804 }
805 }
806 //**********************************************************************************************************************
deconvoluteResults(string outputFileName,string accnosFileName)807 int ChimeraPerseusCommand::deconvoluteResults(string outputFileName, string accnosFileName){
808 try {
809 int total = 0;
810
811 set<string> chimerasInFile = util.readAccnos(accnosFileName);//this is so if a sequence is found to be chimera in several samples we dont write it to the results file more than once
812 util.printAccnos(accnosFileName, chimerasInFile);
813
814 //edit chimera file
815 ifstream in;
816 util.openInputFile(outputFileName, in);
817
818 ofstream out;
819 util.openOutputFile(outputFileName+".temp", out); out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
820
821 int DiffsToBestMatch, BestMatchIndex, DiffstToChimera, IndexofLeftParent, IndexOfRightParent;
822 float temp1,temp2, temp3, temp4, temp5, temp6, temp7, temp8;
823 string index, BestMatchName, parent1, parent2, flag;
824 string name = "";
825 set<string> namesInFile;
826
827 //assumptions - in file each read will always look like
828 /*
829 SequenceIndex Name DiffsToBestMatch BestMatchIndex BestMatchName DiffstToChimera IndexofLeftParent IndexOfRightParent NameOfLeftParent NameOfRightParent DistanceToBestMatch cIndex (cIndex - singleDist) loonIndex MismatchesToChimera MismatchToTrimera ChimeraBreakPoint LogisticProbability TypeOfSequence
830 0 F01QG4L02JVBQY 0 0 Null 0 0 0 Null Null 0.0 0.0 0.0 0.0 0 0 0 0.0 0.0 good
831 1 F01QG4L02ICTC6 0 0 Null 0 0 0 Null Null 0.0 0.0 0.0 0.0 0 0 0 0.0 0.0 good
832 2 F01QG4L02JZOEC 48 0 F01QG4L02JVBQY 47 0 0 F01QG4L02JVBQY F01QG4L02JVBQY 2.0449 2.03545 -0.00944493 0 47 2147483647 138 0 good
833 3 F01QG4L02G7JEC 42 0 F01QG4L02JVBQY 40 1 0 F01QG4L02ICTC6 F01QG4L02JVBQY 1.87477 1.81113 -0.0636404 5.80145 40 2147483647 25 0 good
834 */
835
836 //get and print headers
837 BestMatchName = util.getline(in); util.gobble(in);
838 out << BestMatchName << endl;
839
840 while (!in.eof()) {
841
842 if (m->getControl_pressed()) { in.close(); out.close(); util.mothurRemove((outputFileName+".temp")); return 0; }
843
844 bool print = false;
845 in >> index; util.gobble(in);
846
847 if (index != "SequenceIndex") { //if you are not a header line, there will be a header line for each group if group file is given
848 in >> name; util.gobble(in);
849 in >> DiffsToBestMatch; util.gobble(in);
850 in >> BestMatchIndex; util.gobble(in);
851 in >> BestMatchName; util.gobble(in);
852 in >> DiffstToChimera; util.gobble(in);
853 in >> IndexofLeftParent; util.gobble(in);
854 in >> IndexOfRightParent; util.gobble(in);
855 in >> parent1; util.gobble(in);
856 in >> parent2; util.gobble(in);
857 in >> temp1 >> temp2 >> temp3 >> temp4 >> temp5 >> temp6 >> temp7 >> temp8 >> flag; util.gobble(in);
858
859
860 //is this name already in the file
861 set<string>::iterator itNames = namesInFile.find((name));
862
863 if (itNames == namesInFile.end()) { //no not in file
864 if (flag == "good") { //are you really a no??
865 //is this sequence really not chimeric??
866 set<string>::iterator itChimeras = chimerasInFile.find(name);
867
868 //then you really are a no so print, otherwise skip
869 if (itChimeras == chimerasInFile.end()) { print = true; }
870 }else{ print = true; }
871 }
872
873 if (print) {
874 namesInFile.insert(name);
875 out << index << '\t' << name << '\t' << DiffsToBestMatch << '\t' << BestMatchIndex << '\t';
876 out << BestMatchName << '\t' << DiffstToChimera << '\t' << IndexofLeftParent << '\t' << IndexOfRightParent << '\t' << parent1 << '\t' << parent2 << '\t';
877 out << temp1 << '\t' << temp2 << '\t' << temp3 << '\t' << temp4 << '\t' << temp5 << '\t' << temp6 << '\t' << temp7 << '\t' << temp8 << '\t' << flag << endl;
878 }
879 }else { index = util.getline(in); util.gobble(in); }
880 }
881 in.close();
882 out.close();
883
884 util.mothurRemove(outputFileName);
885 rename((outputFileName+".temp").c_str(), outputFileName.c_str());
886
887 return total;
888 }
889 catch(exception& e) {
890 m->errorOut(e, "ChimeraPerseusCommand", "deconvoluteResults");
891 exit(1);
892 }
893 }
894 //**********************************************************************************************************************
895
896
897