1 /*
2  *  chimeraperseuscommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 10/26/11.
6  *  Copyright 2011 Schloss Lab. All rights reserved.
7  *
8  */
9 
10 #include "chimeraperseuscommand.h"
11 #include "deconvolutecommand.h"
12 #include "sequence.hpp"
13 #include "counttable.h"
14 #include "sequencecountparser.h"
15 //**********************************************************************************************************************
setParameters()16 vector<string> ChimeraPerseusCommand::setParameters(){
17 	try {
18 		CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","chimera-accnos",false,true,true); parameters.push_back(pfasta);
19 		CommandParameter pname("name", "InputTypes", "", "", "NameCount", "NameCount", "none","",false,false,true); parameters.push_back(pname);
20         CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "NameCount", "none","",false,false,true); parameters.push_back(pcount);
21 		CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","",false,false,true); parameters.push_back(pgroup);
22 		CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
23         CommandParameter pdups("dereplicate", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pdups);
24 
25 		CommandParameter pseed("seed", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pseed);
26         CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
27 		CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
28 		CommandParameter pcutoff("cutoff", "Number", "", "0.5", "", "", "","",false,false); parameters.push_back(pcutoff);
29 		CommandParameter palpha("alpha", "Number", "", "-5.54", "", "", "","",false,false); parameters.push_back(palpha);
30 		CommandParameter pbeta("beta", "Number", "", "0.33", "", "", "","",false,false); parameters.push_back(pbeta);
31 
32         abort = false; calledHelp = false;
33 
34         vector<string> tempOutNames;
35         outputTypes["chimera"] = tempOutNames;
36         outputTypes["accnos"] = tempOutNames;
37         outputTypes["count"] = tempOutNames;
38 
39 		vector<string> myArray;
40 		for (int i = 0; i < parameters.size(); i++) {	myArray.push_back(parameters[i].name);		}
41 		return myArray;
42 	}
43 	catch(exception& e) {
44 		m->errorOut(e, "ChimeraPerseusCommand", "setParameters");
45 		exit(1);
46 	}
47 }
48 //**********************************************************************************************************************
getHelpString()49 string ChimeraPerseusCommand::getHelpString(){
50 	try {
51 		string helpString = "";
52 		helpString += "The chimera.perseus command reads a fastafile and namefile or countfile and outputs potentially chimeric sequences.\n";
53 		helpString += "The chimera.perseus command parameters are fasta, name, group, cutoff, processors, dereplicate, alpha and beta.\n";
54 		helpString += "The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required, unless you have a valid current fasta file. \n";
55 		helpString += "The name parameter allows you to provide a name file associated with your fasta file.\n";
56         helpString += "The count parameter allows you to provide a count file associated with your fasta file. A count or name file is required. When you use a count file with group info and dereplicate=T, mothur will create a *.pick.count_table file containing seqeunces after chimeras are removed.\n";
57 		helpString += "The group parameter allows you to provide a group file.  When checking sequences, only sequences from the same group as the query sequence will be used as the reference. \n";
58 		helpString += "The processors parameter allows you to specify how many processors you would like to use.  The default is 1. \n";
59         helpString += "If the dereplicate parameter is false, then if one group finds the seqeunce to be chimeric, then all groups find it to be chimeric, default=f.\n";
60 		helpString += "The alpha parameter ....  The default is -5.54. \n";
61 		helpString += "The beta parameter ....  The default is 0.33. \n";
62 		helpString += "The cutoff parameter ....  The default is 0.50. \n";
63 		helpString += "The chimera.perseus command should be in the following format: \n";
64 		helpString += "chimera.perseus(fasta=yourFastaFile, name=yourNameFile) \n";
65 		helpString += "Example: chimera.perseus(fasta=AD.align, name=AD.names) \n";
66 
67 		return helpString;
68 	}
69 	catch(exception& e) {
70 		m->errorOut(e, "ChimeraPerseusCommand", "getHelpString");
71 		exit(1);
72 	}
73 }
74 //**********************************************************************************************************************
getOutputPattern(string type)75 string ChimeraPerseusCommand::getOutputPattern(string type) {
76     try {
77         string pattern = "";
78 
79         if (type == "chimera") {  pattern = "[filename],perseus.chimeras"; }
80         else if (type == "accnos") {  pattern = "[filename],perseus.accnos"; }
81         else if (type == "count") {  pattern = "[filename],perseus.pick.count_table"; }
82         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->setControl_pressed(true);  }
83 
84         return pattern;
85     }
86     catch(exception& e) {
87         m->errorOut(e, "ChimeraPerseusCommand", "getOutputPattern");
88         exit(1);
89     }
90 }
91 //***************************************************************************************************************
ChimeraPerseusCommand(string option)92 ChimeraPerseusCommand::ChimeraPerseusCommand(string option) : Command()  {
93 	try {
94         hasCount = false;
95 
96 		//allow user to run help
97 		if(option == "help") { help(); abort = true; calledHelp = true; }
98 		else if(option == "citation") { citation(); abort = true; calledHelp = true;}
99         else if(option == "category") {  abort = true; calledHelp = true;  }
100 
101 		else {
102 			OptionParser parser(option, setParameters());
103 			map<string,string> parameters = parser.getParameters();
104 
105 			//check for required parameters
106             ValidParameters validParameter;
107             fastafile = validParameter.validFile(parameters, "fasta");
108             if (fastafile == "not found") {
109                 fastafile = current->getFastaFile();
110                 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter.\n"); }
111                 else { 	m->mothurOut("[ERROR]: You have no current fasta file and the fasta parameter is required.\n");  abort = true; }
112             }
113             else if (fastafile == "not open") { abort = true; }
114             else { current->setFastaFile(fastafile); }
115 
116             bool hasName = false;
117             string namefile = validParameter.validFile(parameters, "name");
118             if (namefile == "not open") { namefile = ""; abort = true; }
119             else if (namefile == "not found") {  namefile = "";  }
120             else { current->setNameFile(namefile); }
121             if (namefile != "") { hasName = true; }
122 
123             countfile = validParameter.validFile(parameters, "count");
124             if (countfile == "not open") { countfile = ""; abort = true; }
125             else if (countfile == "not found") { countfile = "";  }
126             else { current->setCountFile(countfile); }
127             if (countfile != "") { hasCount = true; }
128 
129 			//make sure there is at least one valid file left
130             if (hasName && hasCount) { m->mothurOut("[ERROR]: You must enter ONLY ONE of the following: count or name.\n");  abort = true; }
131 
132             if (!hasName && !hasCount) {
133                 //if there is a current name file, use it, else look for current count file
134 				string filename = current->getNameFile();
135 				if (filename != "") { hasName = true; namefile = filename; m->mothurOut("Using " + filename + " as input file for the name parameter.\n"); }
136 				else {
137                     filename = current->getCountFile();
138                     if (filename != "") { hasCount = true; countfile = filename; m->mothurOut("Using " + filename + " as input file for the count parameter.\n"); }
139                     else { m->mothurOut("[ERROR]: You must provide a count or name file.\n");  abort = true;  }
140                 }
141             }
142 
143 			bool hasGroup = false;
144             string groupfile = validParameter.validFile(parameters, "group");
145             if (groupfile == "not open") { abort = true; }
146             else if (groupfile == "not found") {  groupfile = "";  }
147             else { current->setGroupFile(groupfile); hasGroup = true; }
148 
149             if (hasGroup && hasCount) { m->mothurOut("[ERROR]: You must enter ONLY ONE of the following: count or group.\n");  abort = true; }
150 
151 			string temp = validParameter.valid(parameters, "processors");	if (temp == "not found"){	temp = current->getProcessors();	}
152 			processors = current->setProcessors(temp);
153 
154 			temp = validParameter.valid(parameters, "cutoff");	if (temp == "not found"){	temp = "0.50";	}
155 			util.mothurConvert(temp, cutoff);
156 
157 			temp = validParameter.valid(parameters, "alpha");	if (temp == "not found"){	temp = "-5.54";	}
158 			util.mothurConvert(temp, alpha);
159 
160 			temp = validParameter.valid(parameters, "beta");	if (temp == "not found"){	temp = "0.33";	}
161 			util.mothurConvert(temp, beta);
162 
163 			temp = validParameter.valid(parameters, "dereplicate");
164 			if (temp == "not found") { temp = "false";			}
165 			dups = util.isTrue(temp);
166 
167             if (!abort) {
168                 if ((namefile != "") || (groupfile != "")) { //convert to count
169 
170                     string rootFileName = namefile;
171                     if (rootFileName == "") { rootFileName = groupfile; }
172 
173                     if (outputdir == "") { outputdir = util.hasPath(rootFileName); }
174                     string outputFileName = outputdir + util.getRootName(util.getSimpleName(rootFileName)) + "count_table";
175 
176                     CountTable ct; ct.createTable(namefile, groupfile, nullVector); ct.printCompressedTable(outputFileName);
177                     outputNames.push_back(outputFileName);
178 
179                     current->setCountFile(outputFileName);
180                     countfile = outputFileName;
181                     hasCount = true;
182                 }
183             }
184 		}
185 	}
186 	catch(exception& e) {
187 		m->errorOut(e, "ChimeraPerseusCommand", "ChimeraPerseusCommand");
188 		exit(1);
189 	}
190 }
191 /**************************************************************************************************/
192 struct perseusData {
193     Utils util;
194     MothurOut* m;
195     vector<seqData> sequences;
196     string group;
197     int count, numChimeras;
198     string chimeraFileName;
199     string accnosFileName;
200     double alpha, beta, cutoff;
201 
perseusDataperseusData202     perseusData(string cf, string ac, double a, double b, double c){
203         m = MothurOut::getInstance();
204         count = 0;
205         numChimeras = 0;
206         accnosFileName = ac;
207         chimeraFileName = cf;
208         alpha = a;
209         beta = b;
210         cutoff = c;
211     }
212 };
213 //**********************************************************************************************************************
214 //void driver(string chimeraFileName, vector<seqData>& sequences, string accnosFileName, int& numChimeras){
driver(perseusData * params)215 void driver(perseusData* params){
216     try {
217         vector<vector<double> > correctModel(4);	//could be an option in the future to input own model matrix
218         for(int i=0;i<4;i++){	correctModel[i].resize(4);	}
219 
220         correctModel[0][0] = 0.000000;	//AA
221         correctModel[1][0] = 11.619259;	//CA
222         correctModel[2][0] = 11.694004;	//TA
223         correctModel[3][0] = 7.748623;	//GA
224 
225         correctModel[1][1] = 0.000000;	//CC
226         correctModel[2][1] = 7.619657;	//TC
227         correctModel[3][1] = 12.852562;	//GC
228 
229         correctModel[2][2] = 0.000000;	//TT
230         correctModel[3][2] = 10.964048;	//TG
231 
232         correctModel[3][3] = 0.000000;	//GG
233 
234         for(int i=0;i<4;i++){ for(int j=0;j<i;j++){ correctModel[j][i] = correctModel[i][j]; } }
235 
236         int numSeqs = params->sequences.size();
237         int alignLength = params->sequences[0].sequence.size();
238 
239         ofstream chimeraFile;
240         ofstream accnosFile;
241         params->util.openOutputFile(params->chimeraFileName, chimeraFile);
242         params->util.openOutputFile(params->accnosFileName, accnosFile);
243 
244         Perseus myPerseus;
245         vector<vector<double> > binMatrix = myPerseus.binomial(alignLength);
246 
247         chimeraFile << "SequenceIndex\tName\tDiffsToBestMatch\tBestMatchIndex\tBestMatchName\tDiffstToChimera\tIndexofLeftParent\tIndexOfRightParent\tNameOfLeftParent\tNameOfRightParent\tDistanceToBestMatch\tcIndex\t(cIndex - singleDist)\tloonIndex\tMismatchesToChimera\tMismatchToTrimera\tChimeraBreakPoint\tLogisticProbability\tTypeOfSequence\n";
248 
249         vector<bool> chimeras(numSeqs, 0);
250 
251         for(int i=0;i<numSeqs;i++){
252             if (params->m->getControl_pressed()) { chimeraFile.close(); accnosFile.close(); break; }
253 
254             vector<bool> restricted = chimeras;
255 
256             vector<vector<int> > leftDiffs(numSeqs);
257             vector<vector<int> > leftMaps(numSeqs);
258             vector<vector<int> > rightDiffs(numSeqs);
259             vector<vector<int> > rightMaps(numSeqs);
260 
261             vector<int> singleLeft, bestLeft;
262             vector<int> singleRight, bestRight;
263 
264             int bestSingleIndex, bestSingleDiff;
265             vector<pwAlign> alignments(numSeqs);
266 
267             int comparisons = myPerseus.getAlignments(i, params->sequences, alignments, leftDiffs, leftMaps, rightDiffs, rightMaps, bestSingleIndex, bestSingleDiff, restricted);
268             if (params->m->getControl_pressed()) { chimeraFile.close(); accnosFile.close(); break; }
269 
270             int minMismatchToChimera, leftParentBi, rightParentBi, breakPointBi;
271 
272             string dummyA, dummyB;
273 
274             if (params->sequences[i].sequence.size() < 3) {
275                 chimeraFile << i << '\t' << params->sequences[i].seqName << "\t0\t0\tNull\t0\t0\t0\tNull\tNull\t0.0\t0.0\t0.0\t0\t0\t0\t0.0\t0.0\tgood" << endl;
276             }else if(comparisons >= 2){
277                 minMismatchToChimera = myPerseus.getChimera(params->sequences, leftDiffs, rightDiffs, leftParentBi, rightParentBi, breakPointBi, singleLeft, bestLeft, singleRight, bestRight, restricted);
278                 if (params->m->getControl_pressed()) { chimeraFile.close(); accnosFile.close(); break; }
279 
280                 int minMismatchToTrimera = numeric_limits<int>::max();
281                 int leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB;
282 
283                 if(minMismatchToChimera >= 3 && comparisons >= 3){
284                     minMismatchToTrimera = myPerseus.getTrimera(params->sequences, leftDiffs, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, singleLeft, bestLeft, singleRight, bestRight, restricted);
285                     if (params->m->getControl_pressed()) { chimeraFile.close(); accnosFile.close(); break; }
286                 }
287 
288                 double singleDist = myPerseus.modeledPairwiseAlignSeqs(params->sequences[i].sequence, params->sequences[bestSingleIndex].sequence, dummyA, dummyB, correctModel);
289 
290                 if (params->m->getControl_pressed()) { chimeraFile.close(); accnosFile.close(); break; }
291 
292                 string type;
293                 string chimeraRefSeq;
294 
295                 if(minMismatchToChimera - minMismatchToTrimera >= 3){
296                     type = "trimera";
297                     chimeraRefSeq = myPerseus.stitchTrimera(alignments, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, leftMaps, rightMaps);
298                 }
299                 else{
300                     type = "chimera";
301                     chimeraRefSeq = myPerseus.stitchBimera(alignments, leftParentBi, rightParentBi, breakPointBi, leftMaps, rightMaps);
302                 }
303 
304                 if (params->m->getControl_pressed()) { chimeraFile.close(); accnosFile.close(); break; }
305 
306                 double chimeraDist = myPerseus.modeledPairwiseAlignSeqs(params->sequences[i].sequence, chimeraRefSeq, dummyA, dummyB, correctModel);
307 
308                 if (params->m->getControl_pressed()) { chimeraFile.close(); accnosFile.close(); break; }
309 
310                 double cIndex = chimeraDist;//modeledPairwiseAlignSeqs(sequences[i].sequence, chimeraRefSeq);
311                 double loonIndex = myPerseus.calcLoonIndex(params->sequences[i].sequence, params->sequences[leftParentBi].sequence, params->sequences[rightParentBi].sequence, breakPointBi, binMatrix);
312 
313                 if (params->m->getControl_pressed()) { chimeraFile.close(); accnosFile.close(); break; }
314 
315                 chimeraFile << i << '\t' << params->sequences[i].seqName << '\t' << bestSingleDiff << '\t' << bestSingleIndex << '\t' << params->sequences[bestSingleIndex].seqName << '\t';
316                 chimeraFile << minMismatchToChimera << '\t' << leftParentBi << '\t' << rightParentBi << '\t' << params->sequences[leftParentBi].seqName << '\t' << params->sequences[rightParentBi].seqName << '\t';
317                 chimeraFile << singleDist << '\t' << cIndex << '\t' << (cIndex - singleDist) << '\t' << loonIndex << '\t';
318                 chimeraFile << minMismatchToChimera << '\t' << minMismatchToTrimera << '\t' << breakPointBi << '\t';
319 
320                 double probability = myPerseus.classifyChimera(singleDist, cIndex, loonIndex, params->alpha, params->beta);
321 
322                 chimeraFile << probability << '\t';
323 
324                 if(probability > params->cutoff){
325                     chimeraFile << type << endl;
326                     accnosFile << params->sequences[i].seqName << endl;
327                     chimeras[i] = 1;
328                     params->numChimeras++;
329                 }
330                 else{ chimeraFile << "good" << endl; }
331             }
332             else{
333                 chimeraFile << i << '\t' << params->sequences[i].seqName << "\t0\t0\tNull\t0\t0\t0\tNull\tNull\t0.0\t0.0\t0.0\t0\t0\t0\t0.0\t0.0\tgood" << endl;
334             }
335 
336             //report progress
337             if((i+1) % 100 == 0){ 	params->m->mothurOutJustToScreen("Processing sequence: " + toString(i+1) + "\n");		}
338             params->count++; //# of sequences completed. Used by calling function to check for failure
339         }
340 
341         if((numSeqs) % 100 != 0){ 	params->m->mothurOutJustToScreen("Processing sequence: " + toString(numSeqs) + "\n");		}
342 
343         if (!params->m->getControl_pressed()) { chimeraFile.close(); accnosFile.close(); }
344     }
345     catch(exception& e) {
346         params->m->errorOut(e, "ChimeraPerseusCommand", "driver");
347         exit(1);
348     }
349 }
350 //***************************************************************************************************************
351 
execute()352 int ChimeraPerseusCommand::execute(){
353 	try{
354 		if (abort) { if (calledHelp) { return 0; }  return 2;	}
355 
356         m->mothurOut("Checking sequences from " + fastafile + " ...\n" );
357 
358         long start = time(NULL);
359         if (outputdir == "") { outputdir = util.hasPath(fastafile);  }
360         map<string, string> variables;
361         variables["[filename]"] = outputdir + util.getRootName(util.getSimpleName(fastafile));
362         string outputFileName = getOutputFileName("chimera", variables);
363         string accnosFileName = getOutputFileName("accnos", variables);
364         string newCountFile = "";
365 
366         if (countfile == "") { countfile = getCountFile(fastafile); hasCount=true; }
367 
368         if (m->getControl_pressed()) {  return 0;	}
369 
370         int numSeqs = 0; int numChimeras = 0;
371 
372         if (hasCount) {
373             CountTable ct;
374             vector<string> groups;
375             if (ct.testGroups(countfile, groups)) { //fills groups if count file has them
376 
377                 variables["[filename]"] = outputdir + util.getRootName(util.getSimpleName(countfile));
378                 newCountFile = getOutputFileName("count", variables);
379 
380                 vector<string> groups;
381                 map<string, vector<string> > group2Files;
382                 if (hasCount) {
383                     current->setMothurCalling(true);
384                     SequenceCountParser cparser(countfile, fastafile, nullVector);
385                     current->setMothurCalling(false);
386                     groups = cparser.getNamesOfGroups();
387                     group2Files = cparser.getFiles();
388 
389                 }
390 
391                 if (m->getControl_pressed()) { return 0; }
392 
393                 //clears files
394                 ofstream out, out1, out2;
395                 util.openOutputFile(outputFileName, out); out.close();
396                 util.openOutputFile(accnosFileName, out1); out1.close();
397 
398                 string countlist = accnosFileName+".byCount";
399                 numSeqs = createProcessesGroups(group2Files, outputFileName, countlist, accnosFileName, newCountFile, groups, fastafile, countfile, numChimeras);
400 
401                 if (m->getControl_pressed()) {  for (int j = 0; j < outputNames.size(); j++) {	util.mothurRemove(outputNames[j]);	}  return 0;	}
402 
403                 if (!dups) {
404                     numChimeras = deconvoluteResults(outputFileName, accnosFileName);
405                 }else {
406                     CountTable newCount; newCount.readTable(countfile, true, false);
407 
408                     if (!util.isBlank(countlist)) {
409                         ifstream in2;
410                         util.openInputFile(countlist, in2);
411 
412                         string name, group;
413                         while (!in2.eof()) {
414                             in2 >> name; util.gobble(in2); in2 >> group; util.gobble(in2);
415                             newCount.setAbund(name, group, 0);
416                         }
417                         in2.close();
418                     }
419                     util.mothurRemove(countlist);
420 
421                     //print new *.pick.count_table
422                     vector<string> namesInTable = newCount.printTable(newCountFile);  //returns non zeroed names
423                     outputNames.push_back(newCountFile); outputTypes["count"].push_back(newCountFile);
424 
425                     set<string> doNotRemove = util.mothurConvert(namesInTable);
426 
427                     //remove names we want to keep from accnos file.
428                     set<string> accnosNames = util.readAccnos(accnosFileName);
429                     ofstream out2; util.openOutputFile(accnosFileName, out2);
430                     for (set<string>::iterator it = accnosNames.begin(); it != accnosNames.end(); it++) { if (doNotRemove.count(*it) == 0) {  out2 << (*it) << endl; } }
431                     out2.close();
432                 }
433 
434                 util.mothurRemove(countlist);
435                 m->mothurOut("The number of sequences checked may be larger than the number of unique sequences because some sequences are found in several samples.\n");
436 
437                 if (m->getControl_pressed()) {   for (int j = 0; j < outputNames.size(); j++) {	util.mothurRemove(outputNames[j]);	}  return 0;  }
438 
439             }else {
440                 if (processors != 1) { m->mothurOut("Your count file does not contain group information, mothur can only use 1 processor, continuing.\n");  processors = 1; }
441 
442                 //read sequences and store sorted by frequency
443                 ct.readTable(countfile, false, false);
444                 vector<seqData> sequences = readFiles(fastafile, ct.getNameMap());
445 
446                 if (m->getControl_pressed()) {  for (int j = 0; j < outputNames.size(); j++) {	util.mothurRemove(outputNames[j]);	} return 0; }
447 
448                 perseusData* dataBundle = new perseusData(outputFileName, accnosFileName, alpha, beta, cutoff);
449                 dataBundle->sequences = sequences;
450                 driver(dataBundle);
451                 numSeqs = dataBundle->count; numChimeras = dataBundle->numChimeras;
452                 delete dataBundle;
453             }
454         }
455 
456         if (m->getControl_pressed()) { for (int j = 0; j < outputNames.size(); j++) {	util.mothurRemove(outputNames[j]);	} return 0; }
457 
458         m->mothurOut("\nIt took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences. " + toString(numChimeras) + " chimeras were found.\n");
459         outputNames.push_back(outputFileName); outputTypes["chimera"].push_back(outputFileName);
460         outputNames.push_back(accnosFileName); outputTypes["accnos"].push_back(accnosFileName);
461 
462 
463 		//set accnos file as new current accnosfile
464 		string currentName = "";
465 		itTypes = outputTypes.find("accnos");
466 		if (itTypes != outputTypes.end()) {
467 			if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setAccnosFile(currentName); }
468 		}
469 
470         itTypes = outputTypes.find("count");
471 		if (itTypes != outputTypes.end()) {
472 			if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setCountFile(currentName); }
473 		}
474 
475 		m->mothurOut("\nOutput File Names: \n");
476 		for (int i = 0; i < outputNames.size(); i++) {	m->mothurOut(outputNames[i]); m->mothurOutEndLine();	}
477 		m->mothurOutEndLine();
478 
479 		return 0;
480 
481 	}
482 	catch(exception& e) {
483 		m->errorOut(e, "ChimeraPerseusCommand", "execute");
484 		exit(1);
485 	}
486 }
487 //**********************************************************************************************************************
getCountFile(string & inputFile)488 string ChimeraPerseusCommand::getCountFile(string& inputFile){
489 	try {
490 		string countFile = "";
491 
492 		m->mothurOut("\nNo count file given, running unique.seqs command to generate one.\n\n");
493 
494 		//use unique.seqs to create new name and fastafile
495 		string inputString = "format=count, fasta=" + inputFile;
496 		m->mothurOut("/******************************************/\n");
497 		m->mothurOut("Running command: unique.seqs(" + inputString + ")\n");
498 		current->setMothurCalling(true);
499 
500 		Command* uniqueCommand = new DeconvoluteCommand(inputString);
501 		uniqueCommand->execute();
502 
503 		map<string, vector<string> > filenames = uniqueCommand->getOutputFiles();
504 
505 		delete uniqueCommand;
506 		current->setMothurCalling(false);
507 		m->mothurOut("/******************************************/\n");
508 
509 		countFile = filenames["count"][0];
510 		inputFile = filenames["fasta"][0];
511 
512 		return countFile;
513 	}
514 	catch(exception& e) {
515 		m->errorOut(e, "ChimeraPerseusCommand", "getNamesFile");
516 		exit(1);
517 	}
518 }
519 /**************************************************************************************************/
520 struct perseusGroupsData {
521     string fastafile;
522     string dupsfile;
523     string chimeraFileName;
524     string accnosFileName;
525     string countlist;
526     map<string, vector<string> > parsedFiles;
527 
528     bool hasCount, dups;
529     int threadID, count, numChimeras;
530     double alpha, beta, cutoff;
531     vector<string> groups;
532     Utils util;
533     MothurOut* m;
534 
perseusGroupsDataperseusGroupsData535     perseusGroupsData(){}
perseusGroupsDataperseusGroupsData536     perseusGroupsData(map<string, vector<string> >& g2f,bool dps, bool hc, double a, double b, double c, string o,  string f, string n, string ac, string ctlist, vector<string> gr, int tid) {
537         alpha = a;
538         beta = b;
539         cutoff = c;
540         fastafile = f;
541         dupsfile = n;
542         chimeraFileName = o;
543         countlist = ctlist;
544         accnosFileName = ac;
545         m = MothurOut::getInstance();
546         threadID = tid;
547         groups = gr;
548         hasCount = hc;
549         dups = dps;
550         count = 0;
551         numChimeras = 0;
552         parsedFiles = g2f;
553     }
554 };
555 //**********************************************************************************************************************
loadSequences(map<string,int> & nameMap,string thisGroupsFastaFile,perseusGroupsData * params)556 vector<seqData> loadSequences(map<string, int>& nameMap, string thisGroupsFastaFile, perseusGroupsData* params){
557     try {
558         bool error = false;
559         vector<seqData> sequences;
560 
561         ifstream in;
562         params->util.openInputFile(thisGroupsFastaFile, in);
563 
564         vector<seqPriorityNode> nameVector;
565         map<string, int>::iterator itNameMap;
566         while (!in.eof()) {
567             if (params->m->getControl_pressed()) { break; }
568 
569             Sequence seq(in); params->util.gobble(in);
570 
571             itNameMap = nameMap.find(seq.getName());
572 
573             if (itNameMap == nameMap.end()){
574                 error = true;
575                 params->m->mothurOut("[ERROR]: " + seq.getName() + " is in your fastafile, but is not in your name or count file, please correct.\n");
576             }else {
577                 int num = itNameMap->second;
578 
579                 seq.setAligned(params->util.removeNs(seq.getUnaligned()));
580                 sequences.push_back(seqData(seq.getName(), seq.getUnaligned(), num));
581             }
582         }
583         in.close();
584 
585         if (error) { params->m->setControl_pressed(true); }
586 
587         //sort by frequency
588         sort(sequences.rbegin(), sequences.rend());
589 
590         return sequences;
591     }
592     catch(exception& e) {
593         params->m->errorOut(e, "ChimeraPerseusCommand", "loadSequences");
594         exit(1);
595     }
596 }
597 
598 //**********************************************************************************************************************
599 //string outputFName, string accnos, string countlist, int start, int end, vector<string> groups
driverGroups(perseusGroupsData * params)600 void driverGroups(perseusGroupsData* params){
601 	try {
602         //clears files
603         ofstream out, out1, out2;
604         params->util.openOutputFile(params->chimeraFileName, out); out.close();
605         params->util.openOutputFile(params->accnosFileName, out1); out1.close();
606 
607 		int totalSeqs = 0;
608         ofstream outCountList;
609         if (params->hasCount && params->dups) { params->util.openOutputFile(params->countlist, outCountList); }
610 
611         for (map<string, vector<string> >::iterator it = params->parsedFiles.begin(); it != params->parsedFiles.end(); it++) {
612             long start = time(NULL);	 if (params->m->getControl_pressed()) {  break; }
613 
614 
615             string thisGroup = it->first;
616 
617             map<string, int> nameMap;
618             if (params->hasCount) {
619                 CountTable ct; ct.readTable(it->second[1], false, true);
620                 nameMap = ct.getNameMap();
621             }
622 
623 			params->m->mothurOut("\nChecking sequences from group " + thisGroup + "...\n");
624 
625             perseusData* driverParams = new perseusData((params->chimeraFileName+thisGroup), (params->accnosFileName+thisGroup), params->alpha, params->beta, params->cutoff);
626 			driverParams->sequences = loadSequences(nameMap, it->second[0], params);
627 
628             if (params->m->getControl_pressed()) { break; }
629 
630 			driver(driverParams);
631 			totalSeqs += driverParams->count;
632             params->numChimeras += driverParams->numChimeras;
633 
634 			if (params->m->getControl_pressed()) { break; }
635 
636             if (params->dups) {
637                 if (!params->util.isBlank(driverParams->accnosFileName)) {
638                     ifstream in;
639                     params->util.openInputFile(driverParams->accnosFileName, in);
640                     string name;
641                     if (params->hasCount) {
642                         while (!in.eof()) {
643                             in >> name; params->util.gobble(in);
644                             outCountList << name << '\t' << thisGroup << endl;
645                         }
646                         in.close();
647                     }
648                 }
649             }
650 
651 			//append files
652 			params->util.appendFiles(driverParams->chimeraFileName, params->chimeraFileName); params->util.mothurRemove(driverParams->chimeraFileName);
653 			params->util.appendFiles(driverParams->accnosFileName, params->accnosFileName); params->util.mothurRemove(driverParams->accnosFileName);
654 
655 			params->m->mothurOut("\nIt took " + toString(time(NULL) - start) + " secs to check " + toString(driverParams->count) + " sequences from group " + thisGroup + ".\n");
656             delete driverParams;
657 		}
658 
659         if (params->hasCount && params->dups) { outCountList.close(); }
660 
661 		params->count = totalSeqs;
662 
663 	}
664 	catch(exception& e) {
665 		params->m->errorOut(e, "ChimeraPerseusCommand", "driverGroups");
666 		exit(1);
667 	}
668 }
669 //**********************************************************************************************************************
readFiles(string inputFile,map<string,int> nameMap)670 vector<seqData> ChimeraPerseusCommand::readFiles(string inputFile, map<string, int> nameMap){
671 	try {
672 		map<string, int>::iterator it;
673 
674 		//read fasta file and create sequenceData structure - checking for file mismatches
675 		vector<seqData> sequences;
676 		bool error = false;
677 		ifstream in;
678 		util.openInputFile(inputFile, in);
679 		alignLength = 0;
680 
681 		while (!in.eof()) {
682 
683 			if (m->getControl_pressed()) { in.close(); return sequences; }
684 
685 			Sequence temp(in); util.gobble(in);
686 
687 			it = nameMap.find(temp.getName());
688 			if (it == nameMap.end()) { error = true; m->mothurOut("[ERROR]: " + temp.getName() + " is in your fasta file and not in your namefile, please correct.\n");  }
689 			else {
690                 temp.setAligned(util.removeNs(temp.getUnaligned()));
691 				sequences.push_back(seqData(temp.getName(), temp.getUnaligned(), it->second));
692                 if (temp.getUnaligned().length() > alignLength) { alignLength = temp.getUnaligned().length(); }
693 			}
694 		}
695 		in.close();
696 
697 		if (error) { m->setControl_pressed(true); }
698 
699 		//sort by frequency
700 		sort(sequences.rbegin(), sequences.rend());
701 
702 		return sequences;
703 	}
704 	catch(exception& e) {
705 		m->errorOut(e, "ChimeraPerseusCommand", "readFiles");
706 		exit(1);
707 	}
708 }
709 /**************************************************************************************************/
710 //perseusData(vector<seqData>& s, double a, double b, double c, string o, string ac, MothurOut* mout)
711 //numSeqs = createProcessesGroups(outputFileName, countlist, accnosFileName, newCountFile, groups, fastafile, countfile, numChimeras);
createProcessesGroups(map<string,vector<string>> & parsedFiles,string outputFName,string countlisttemp,string accnos,string newCountFile,vector<string> groups,string fasta,string dupsFile,int & numChimeras)712 int ChimeraPerseusCommand::createProcessesGroups(map<string, vector<string> >& parsedFiles, string outputFName, string countlisttemp, string accnos, string newCountFile, vector<string> groups, string fasta, string dupsFile, int& numChimeras) {
713 	try {
714         numChimeras = 0;
715 
716 		//sanity check
717 		if (groups.size() < processors) { processors = groups.size(); m->mothurOut("Reducing processors to " + toString(groups.size()) + ".\n"); }
718 
719 		//divide the groups between the processors
720 		vector<linePair> lines;
721 		int remainingPairs = groups.size();
722         int startIndex = 0;
723         for (int remainingProcessors = processors; remainingProcessors > 0; remainingProcessors--) {
724             int numPairs = remainingPairs; //case for last processor
725             if (remainingProcessors != 1) { numPairs = ceil(remainingPairs / remainingProcessors); }
726             lines.push_back(linePair(startIndex, (startIndex+numPairs))); //startIndex, endIndex
727             startIndex = startIndex + numPairs;
728             remainingPairs = remainingPairs - numPairs;
729         }
730 
731         //create array of worker threads
732         vector<std::thread*> workerThreads;
733         vector<perseusGroupsData*> data;
734 
735         long long num = 0;
736         time_t start, end;
737         time(&start);
738 
739         //Lauch worker threads
740         for (int i = 0; i < processors-1; i++) {
741             string extension = toString(i+1) + ".temp";
742             vector<string> thisGroups;
743             map<string, vector<string> > thisGroupsParsedFiles;
744             for (int j = lines[i+1].start; j < lines[i+1].end; j++) {
745 
746                 map<string, vector<string> >::iterator it = parsedFiles.find(groups[j]);
747                 if (it != parsedFiles.end()) {
748                     thisGroupsParsedFiles[groups[j]] = (it->second);
749                     thisGroups.push_back(groups[j]);
750                 }
751                 else { m->mothurOut("[ERROR]: missing files for group " + groups[j] + ", skipping\n"); }
752             }
753             perseusGroupsData* dataBundle = new perseusGroupsData(thisGroupsParsedFiles, dups, hasCount, alpha, beta, cutoff, (outputFName+extension), fasta, dupsFile,  (accnos+extension), (countlisttemp+extension), thisGroups, (i+1));
754             data.push_back(dataBundle);
755 
756             workerThreads.push_back(new std::thread(driverGroups, dataBundle));
757         }
758 
759         vector<string> thisGroups;
760         map<string, vector<string> > thisGroupsParsedFiles;
761         for (int j = lines[0].start; j < lines[0].end; j++) {
762 
763             map<string, vector<string> >::iterator it = parsedFiles.find(groups[j]);
764             if (it != parsedFiles.end()) {
765                 thisGroupsParsedFiles[groups[j]] = (it->second);
766                 thisGroups.push_back(groups[j]);
767             }
768             else { m->mothurOut("[ERROR]: missing files for group " + groups[j] + ", skipping\n"); }
769         }
770         perseusGroupsData* dataBundle = new perseusGroupsData(thisGroupsParsedFiles, dups, hasCount, alpha, beta, cutoff, outputFName, fasta, dupsFile,  accnos, countlisttemp, thisGroups, 0);
771         driverGroups(dataBundle);
772         num = dataBundle->count;
773         numChimeras = dataBundle->numChimeras;
774 
775         for (int i = 0; i < processors-1; i++) {
776             workerThreads[i]->join();
777             num += data[i]->count;
778             numChimeras += data[i]->numChimeras;
779 
780             string extension = toString(i+1) + ".temp";
781             util.appendFiles((outputFName+extension), outputFName);
782             util.mothurRemove((outputFName+extension));
783 
784             util.appendFiles((accnos+extension), accnos);
785             util.mothurRemove((accnos+extension));
786 
787             util.appendFiles((countlisttemp+extension), countlisttemp);
788             util.mothurRemove((countlisttemp+extension));
789 
790             delete data[i];
791             delete workerThreads[i];
792         }
793         delete dataBundle;
794 
795         time(&end);
796         m->mothurOut("It took " + toString(difftime(end, start)) + " secs to check " + toString(num) + " sequences.\n\n");
797 
798 		return num;
799 
800 	}
801 	catch(exception& e) {
802 		m->errorOut(e, "ChimeraPerseusCommand", "createProcessesGroups");
803 		exit(1);
804 	}
805 }
806 //**********************************************************************************************************************
deconvoluteResults(string outputFileName,string accnosFileName)807 int ChimeraPerseusCommand::deconvoluteResults(string outputFileName, string accnosFileName){
808 	try {
809         int total = 0;
810 
811         set<string> chimerasInFile = util.readAccnos(accnosFileName);//this is so if a sequence is found to be chimera in several samples we dont write it to the results file more than once
812         util.printAccnos(accnosFileName, chimerasInFile);
813 
814 		//edit chimera file
815 		ifstream in;
816 		util.openInputFile(outputFileName, in);
817 
818 		ofstream out;
819 		util.openOutputFile(outputFileName+".temp", out); out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
820 
821 		int DiffsToBestMatch, BestMatchIndex, DiffstToChimera, IndexofLeftParent, IndexOfRightParent;
822 		float temp1,temp2, temp3, temp4, temp5, temp6, temp7, temp8;
823 		string index, BestMatchName, parent1, parent2, flag;
824 		string name = "";
825         set<string> namesInFile;
826 
827 		//assumptions - in file each read will always look like
828 		/*
829 		 SequenceIndex	Name	DiffsToBestMatch	BestMatchIndex	BestMatchName	DiffstToChimera	IndexofLeftParent	IndexOfRightParent	NameOfLeftParent	NameOfRightParent	DistanceToBestMatch	cIndex	(cIndex - singleDist)	loonIndex	MismatchesToChimera	MismatchToTrimera	ChimeraBreakPoint	LogisticProbability	TypeOfSequence
830 		 0	F01QG4L02JVBQY	0	0	Null	0	0	0	Null	Null	0.0	0.0	0.0	0.0	0	0	0	0.0	0.0	good
831 		 1	F01QG4L02ICTC6	0	0	Null	0	0	0	Null	Null	0.0	0.0	0.0	0.0	0	0	0	0.0	0.0	good
832 		 2	F01QG4L02JZOEC	48	0	F01QG4L02JVBQY	47	0	0	F01QG4L02JVBQY	F01QG4L02JVBQY	2.0449	2.03545	-0.00944493	0	47	2147483647	138	0	good
833 		 3	F01QG4L02G7JEC	42	0	F01QG4L02JVBQY	40	1	0	F01QG4L02ICTC6	F01QG4L02JVBQY	1.87477	1.81113	-0.0636404	5.80145	40	2147483647	25	0	good
834 		 */
835 
836 		//get and print headers
837 		BestMatchName = util.getline(in); util.gobble(in);
838 		out << BestMatchName << endl;
839 
840 		while (!in.eof()) {
841 
842 			if (m->getControl_pressed()) { in.close(); out.close(); util.mothurRemove((outputFileName+".temp")); return 0; }
843 
844 			bool print = false;
845 			in >> index;	util.gobble(in);
846 
847 			if (index != "SequenceIndex") { //if you are not a header line, there will be a header line for each group if group file is given
848 				in >> name;		util.gobble(in);
849 				in >> DiffsToBestMatch; util.gobble(in);
850 				in >> BestMatchIndex; util.gobble(in);
851 				in >> BestMatchName; util.gobble(in);
852 				in >> DiffstToChimera; util.gobble(in);
853 				in >> IndexofLeftParent; util.gobble(in);
854 				in >> IndexOfRightParent; util.gobble(in);
855 				in >> parent1;	util.gobble(in);
856 				in >> parent2;	util.gobble(in);
857 				in >> temp1 >> temp2 >> temp3 >> temp4 >> temp5 >> temp6 >> temp7 >> temp8 >> flag; util.gobble(in);
858 
859 
860                 //is this name already in the file
861                 set<string>::iterator itNames = namesInFile.find((name));
862 
863                 if (itNames == namesInFile.end()) { //no not in file
864                     if (flag == "good") { //are you really a no??
865                         //is this sequence really not chimeric??
866                         set<string>::iterator itChimeras = chimerasInFile.find(name);
867 
868                         //then you really are a no so print, otherwise skip
869                         if (itChimeras == chimerasInFile.end()) { print = true; }
870                     }else{ print = true; }
871                 }
872 
873 				if (print) {
874                     namesInFile.insert(name);
875 					out << index << '\t' << name  << '\t' << DiffsToBestMatch << '\t' << BestMatchIndex << '\t';
876 					out << BestMatchName << '\t' << DiffstToChimera << '\t' << IndexofLeftParent << '\t' << IndexOfRightParent << '\t' << parent1 << '\t' << parent2 << '\t';
877 					out << temp1 << '\t' << temp2 << '\t' << temp3 << '\t' << temp4 << '\t' << temp5 << '\t' << temp6 << '\t' << temp7 << '\t' << temp8 << '\t' << flag << endl;
878 				}
879 			}else { index = util.getline(in); util.gobble(in); }
880 		}
881 		in.close();
882 		out.close();
883 
884 		util.mothurRemove(outputFileName);
885 		rename((outputFileName+".temp").c_str(), outputFileName.c_str());
886 
887 		return total;
888 	}
889 	catch(exception& e) {
890 		m->errorOut(e, "ChimeraPerseusCommand", "deconvoluteResults");
891 		exit(1);
892 	}
893 }
894 //**********************************************************************************************************************
895 
896 
897