1 /*
2  *  ryanscommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 9/23/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9 
10 #include "clusterfragmentscommand.h"
11 #include "needlemanoverlap.hpp"
12 
13 //**********************************************************************************************************************
14 //sort by unaligned
comparePriority(seqRNode first,seqRNode second)15 inline bool comparePriority(seqRNode first, seqRNode second) {
16 	bool better = false;
17 
18 	if (first.length > second.length) {
19 		better = true;
20 	}else if (first.length == second.length) {
21 		if (first.numIdentical > second.numIdentical) {
22 			better = true;
23 		}
24 	}
25 
26 	return better;
27 }
28 //**********************************************************************************************************************
setParameters()29 vector<string> ClusterFragmentsCommand::setParameters(){
30 	try {
31 		CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","fasta-name",false,true,true); parameters.push_back(pfasta);
32 		CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none","name",false,false,true); parameters.push_back(pname);
33         CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none","count",false,false,true); parameters.push_back(pcount);
34 		CommandParameter pdiffs("diffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pdiffs);
35 		CommandParameter ppercent("percent", "Number", "", "0", "", "", "","",false,false); parameters.push_back(ppercent);
36 		CommandParameter pseed("seed", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pseed);
37         CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
38 		CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
39 
40         abort = false; calledHelp = false;
41 
42         vector<string> tempOutNames;
43         outputTypes["fasta"] = tempOutNames;
44         outputTypes["name"] = tempOutNames;
45         outputTypes["count"] = tempOutNames;
46 
47 		vector<string> myArray;
48 		for (int i = 0; i < parameters.size(); i++) {	myArray.push_back(parameters[i].name);		}
49 		return myArray;
50 	}
51 	catch(exception& e) {
52 		m->errorOut(e, "ClusterFragmentsCommand", "setParameters");
53 		exit(1);
54 	}
55 }
56 //**********************************************************************************************************************
getHelpString()57 string ClusterFragmentsCommand::getHelpString(){
58 	try {
59 		string helpString = "";
60 		helpString += "The cluster.fragments command groups sequences that are part of a larger sequence.\n";
61 		helpString += "The cluster.fragments command outputs a new fasta and name or count file.\n";
62 		helpString += "The cluster.fragments command parameters are fasta, name, count, diffs and percent. The fasta parameter is required, unless you have a valid current file. \n";
63 		helpString += "The names parameter allows you to give a list of seqs that are identical. This file is 2 columns, first column is name or representative sequence, second column is a list of its identical sequences separated by commas.\n";
64 		helpString += "The diffs parameter allows you to set the number of differences allowed, default=0. \n";
65 		helpString += "The percent parameter allows you to set percentage of differences allowed, default=0. percent=2 means if the number of difference is less than or equal to two percent of the length of the fragment, then cluster.\n";
66 		helpString += "You may use diffs and percent at the same time to say something like: If the number or differences is greater than 1 or more than 2% of the fragment length, don't merge. \n";
67 		helpString += "The cluster.fragments command should be in the following format: \n";
68 		helpString += "cluster.fragments(fasta=yourFastaFile, names=yourNamesFile) \n";
69 		helpString += "Example cluster.fragments(fasta=amazon.fasta).\n";
70 		;
71 		return helpString;
72 	}
73 	catch(exception& e) {
74 		m->errorOut(e, "ClusterFragmentsCommand", "getHelpString");
75 		exit(1);
76 	}
77 }
78 //**********************************************************************************************************************
getOutputPattern(string type)79 string ClusterFragmentsCommand::getOutputPattern(string type) {
80     try {
81         string pattern = "";
82 
83         if (type == "fasta") {  pattern = "[filename],fragclust.fasta"; }
84         else if (type == "name") {  pattern = "[filename],fragclust.names"; }
85         else if (type == "count") {  pattern = "[filename],fragclust.count_table"; }
86         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->setControl_pressed(true);  }
87 
88         return pattern;
89     }
90     catch(exception& e) {
91         m->errorOut(e, "ClusterFragmentsCommand", "getOutputPattern");
92         exit(1);
93     }
94 }
95 //**********************************************************************************************************************
ClusterFragmentsCommand(string option)96 ClusterFragmentsCommand::ClusterFragmentsCommand(string option) : Command() {
97 	try {
98 		//allow user to run help
99 		if(option == "help") { help(); abort = true; calledHelp = true; }
100 		else if(option == "citation") { citation(); abort = true; calledHelp = true;}
101         else if(option == "category") {  abort = true; calledHelp = true;  }
102 
103 		else {
104 			OptionParser parser(option, setParameters());
105 			map<string, string> parameters = parser.getParameters();
106 
107 			ValidParameters validParameter;
108 			fastafile = validParameter.validFile(parameters, "fasta");
109 			if (fastafile == "not found") {
110 				fastafile = current->getFastaFile();
111 				if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter.\n");  }
112 				else { 	m->mothurOut("You have no current fastafile and the fasta parameter is required.\n");  abort = true; }
113 			}
114 			else if (fastafile == "not open") { fastafile = ""; abort = true; }
115 			else { current->setFastaFile(fastafile); }
116 
117 
118 			if (outputdir == ""){	outputdir = util.hasPath(fastafile); 	}
119 
120 			//check for optional parameter and set defaults
121 			// ...at some point should added some additional type checking...
122 			namefile = validParameter.validFile(parameters, "name");
123 			if (namefile == "not found") { namefile =  "";  }
124 			else if (namefile == "not open") { namefile = ""; abort = true; }
125 			else {  readNameFile(); current->setNameFile(namefile); }
126 
127             countfile = validParameter.validFile(parameters, "count");
128 			if (countfile == "not open") { abort = true; countfile = ""; }
129 			else if (countfile == "not found") { countfile = ""; }
130 			else { ct.readTable(countfile, true, false); current->setCountFile(countfile); }
131 
132             if ((countfile != "") && (namefile != "")) { m->mothurOut("When executing a cluster.fragments command you must enter ONLY ONE of the following: count or name.\n");  abort = true; }
133 
134 			string temp;
135 			temp = validParameter.valid(parameters, "diffs");		if (temp == "not found"){	temp = "0";				}
136 			util.mothurConvert(temp, diffs);
137 
138 			temp = validParameter.valid(parameters, "percent");		if (temp == "not found"){	temp = "0";				}
139 			util.mothurConvert(temp, percent);
140 
141 			if (countfile == "") {
142                 if (namefile == "") {
143                     vector<string> files; files.push_back(fastafile);
144                     if (!current->getMothurCalling())  {  parser.getNameFile(files);  }
145                 }
146             }
147 		}
148 	}
149 	catch(exception& e) {
150 		m->errorOut(e, "ClusterFragmentsCommand", "ClusterFragmentsCommand");
151 		exit(1);
152 	}
153 }
154 //**********************************************************************************************************************
execute()155 int ClusterFragmentsCommand::execute(){
156 	try {
157 
158 		if (abort) { if (calledHelp) { return 0; }  return 2;	}
159 
160 		long start = time(NULL);
161 
162 		//reads fasta file and return number of seqs
163 		int numSeqs = readFASTA(); //fills alignSeqs and makes all seqs active
164 
165 		if (m->getControl_pressed()) { return 0; }
166 
167 		if (numSeqs == 0) { m->mothurOut("Error reading fasta file...please correct.\n");  return 0;  }
168 
169 		//sort seqs by length of unaligned sequence
170 		sort(alignSeqs.begin(), alignSeqs.end(), comparePriority);
171 
172 		int count = 0;
173 
174 		//think about running through twice...
175 		for (int i = 0; i < numSeqs; i++) {
176 
177 			if (alignSeqs[i].active) {  //this sequence has not been merged yet
178 
179 				string iBases = alignSeqs[i].seq.getUnaligned();
180 
181 				//try to merge it with all smaller seqs
182 				for (int j = i+1; j < numSeqs; j++) {
183 
184 					if (m->getControl_pressed()) { return 0; }
185 
186 					if (alignSeqs[j].active) {  //this sequence has not been merged yet
187 
188 						string jBases = alignSeqs[j].seq.getUnaligned();
189 
190 						if (isFragment(iBases, jBases)) {
191                             if (countfile != "") {
192                                 ct.mergeCounts(alignSeqs[i].names, alignSeqs[j].names);
193                             }else {
194                                 //merge
195                                 alignSeqs[i].names += ',' + alignSeqs[j].names;
196                                 alignSeqs[i].numIdentical += alignSeqs[j].numIdentical;
197                             }
198 							alignSeqs[j].active = 0;
199 							alignSeqs[j].numIdentical = 0;
200 							count++;
201 						}
202 					}//end if j active
203 				}//end if i != j
204 
205 				//remove from active list
206 				alignSeqs[i].active = 0;
207 
208 			}//end if active i
209 			if(i % 100 == 0)	{ m->mothurOutJustToScreen(toString(i) + "\t" + toString(numSeqs - count) + "\t" + toString(count)+"\n"); 	}
210 		}
211 
212 		if(numSeqs % 100 != 0)	{ m->mothurOutJustToScreen(toString(numSeqs) + "\t" + toString(numSeqs - count) + "\t" + toString(count)+"\n");	}
213 
214 
215 		string fileroot = outputdir + util.getRootName(util.getSimpleName(fastafile));
216         map<string, string> variables;
217         variables["[filename]"] = fileroot;
218 		string newFastaFile = getOutputFileName("fasta", variables);
219 		string newNamesFile = getOutputFileName("name", variables);
220         if (countfile != "") { newNamesFile = getOutputFileName("count", variables); }
221 
222 		if (m->getControl_pressed()) { return 0; }
223 
224 		m->mothurOut("\nTotal number of sequences before cluster.fragments was " + toString(alignSeqs.size()) + ".\n");
225 		m->mothurOut("cluster.fragments removed " + toString(count) + " sequences.\n\n");
226 
227 		printData(newFastaFile, newNamesFile);
228 
229 		m->mothurOut("It took " + toString(time(NULL) - start) + " secs to cluster " + toString(numSeqs) + " sequences.\n");
230 
231 		if (m->getControl_pressed()) { util.mothurRemove(newFastaFile); util.mothurRemove(newNamesFile); return 0; }
232 
233 		m->mothurOut("\nOutput File Names: \n");
234 		m->mothurOut(newFastaFile); m->mothurOutEndLine();
235 		m->mothurOut(newNamesFile); m->mothurOutEndLine();
236 		outputNames.push_back(newFastaFile);  outputNames.push_back(newNamesFile); outputTypes["fasta"].push_back(newFastaFile); outputTypes["name"].push_back(newNamesFile);
237 		m->mothurOutEndLine();
238 
239 		//set fasta file as new current fastafile
240 		string currentName = "";
241 		itTypes = outputTypes.find("fasta");
242 		if (itTypes != outputTypes.end()) {
243 			if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setFastaFile(currentName); }
244 		}
245 
246 		itTypes = outputTypes.find("name");
247 		if (itTypes != outputTypes.end()) {
248 			if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setNameFile(currentName); }
249 		}
250 
251         itTypes = outputTypes.find("count");
252 		if (itTypes != outputTypes.end()) {
253 			if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setCountFile(currentName); }
254 		}
255 
256 		return 0;
257 
258 	}
259 	catch(exception& e) {
260 		m->errorOut(e, "ClusterFragmentsCommand", "execute");
261 		exit(1);
262 	}
263 }
264 //***************************************************************************************************************
isFragment(string seq1,string seq2)265 bool ClusterFragmentsCommand::isFragment(string seq1, string seq2){
266 	try {
267 		bool fragment = false;
268 
269 		//exact match
270 		int pos = seq1.find(seq2);
271 		if (pos != string::npos) { return true; }
272 		//no match, no diffs wanted
273 		else if ((diffs == 0) && (percent == 0)) { return false; }
274 		else { //try aligning and see if you can find it
275 
276 			//find number of acceptable differences for this sequence fragment
277 			int totalDiffs = 0;
278 			if (diffs == 0) { //you didnt set diffs you want a percentage
279 				totalDiffs = floor((seq2.length() * (percent / 100.0)));
280 			}else if (percent == 0) { //you didn't set percent you want diffs
281 				totalDiffs = diffs;
282 			}else if ((percent != 0) && (diffs != 0)) { //you want both, set total diffs to smaller of 2
283 				totalDiffs = diffs;
284 				int percentDiff = floor((seq2.length() * (percent / 100.0)));
285 				if (percentDiff < totalDiffs) { totalDiffs = percentDiff; }
286 			}
287 
288 			Alignment* alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (seq1.length()+totalDiffs+1));
289 
290 			//use needleman to align
291 			alignment->align(seq2, seq1);
292 			string tempSeq2 = alignment->getSeqAAln();
293 			string temp = alignment->getSeqBAln();
294 
295 			delete alignment;
296 
297 			//chop gap ends
298 			int startPos = 0;
299 			int endPos = tempSeq2.length()-1;
300 			for (int i = 0; i < tempSeq2.length(); i++) {  if (isalpha(tempSeq2[i])) { startPos = i; break; } }
301 			for (int i = tempSeq2.length()-1; i >= 0; i--) {  if (isalpha(tempSeq2[i])) { endPos = i; break; } }
302 
303 			//count number of diffs
304 			int numDiffs = 0;
305 			for (int i = startPos; i <= endPos; i++) {
306 				if (tempSeq2[i] != temp[i]) { numDiffs++; }
307 			}
308 
309 			if (numDiffs <= totalDiffs) { fragment = true; }
310 
311 		}
312 
313 		return fragment;
314 
315 	}
316 	catch(exception& e) {
317 		m->errorOut(e, "ClusterFragmentsCommand", "isFragment");
318 		exit(1);
319 	}
320 }
321 /**************************************************************************************************/
readFASTA()322 int ClusterFragmentsCommand::readFASTA(){
323 	try {
324 
325 		ifstream inFasta;
326 		util.openInputFile(fastafile, inFasta);
327 
328 		while (!inFasta.eof()) {
329 
330 			if (m->getControl_pressed()) { inFasta.close(); return 0; }
331 
332 			Sequence seq(inFasta);  util.gobble(inFasta);
333 
334 			if (seq.getName() != "") {  //can get "" if commented line is at end of fasta file
335 				if (namefile != "") {
336 					itSize = sizes.find(seq.getName());
337 
338 					if (itSize == sizes.end()) { m->mothurOut(seq.getName() + " is not in your names file, please correct.\n");  exit(1); }
339 					else{
340 						seqRNode tempNode(itSize->second, seq, names[seq.getName()], seq.getUnaligned().length());
341 						alignSeqs.push_back(tempNode);
342 					}
343                 }else if(countfile != "") {
344                     seqRNode tempNode(ct.getNumSeqs(seq.getName()), seq, seq.getName(), seq.getUnaligned().length());
345                     alignSeqs.push_back(tempNode);
346 				}else { //no names file, you are identical to yourself
347 					seqRNode tempNode(1, seq, seq.getName(), seq.getUnaligned().length());
348 					alignSeqs.push_back(tempNode);
349 				}
350 			}
351 		}
352 
353 		inFasta.close();
354 		return alignSeqs.size();
355 	}
356 
357 	catch(exception& e) {
358 		m->errorOut(e, "ClusterFragmentsCommand", "readFASTA");
359 		exit(1);
360 	}
361 }
362 /**************************************************************************************************/
printData(string newfasta,string newname)363 void ClusterFragmentsCommand::printData(string newfasta, string newname){
364 	try {
365 		ofstream outFasta;
366 		ofstream outNames;
367 
368 		util.openOutputFile(newfasta, outFasta);
369 		if (countfile == "") {  util.openOutputFile(newname, outNames); }
370 
371 		for (int i = 0; i < alignSeqs.size(); i++) {
372 			if (alignSeqs[i].numIdentical != 0) {
373 				alignSeqs[i].seq.printSequence(outFasta);
374 				if (countfile == "") {  outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl;  }
375 			}
376 		}
377 
378 		outFasta.close();
379 		if (countfile == "") {  outNames.close(); }
380         else { ct.printTable(newname); }
381 	}
382 	catch(exception& e) {
383 		m->errorOut(e, "ClusterFragmentsCommand", "printData");
384 		exit(1);
385 	}
386 }
387 /**************************************************************************************************/
388 
readNameFile()389 void ClusterFragmentsCommand::readNameFile(){
390 	try {
391 		ifstream in;
392 		util.openInputFile(namefile, in);
393 		string firstCol, secondCol;
394 
395 		while (!in.eof()) {
396 			in >> firstCol >> secondCol; util.gobble(in);
397 			names[firstCol] = secondCol;
398 			int size = 1;
399 
400 			for(int i=0;i<secondCol.size();i++){
401 				if(secondCol[i] == ','){	size++;	}
402 			}
403 			sizes[firstCol] = size;
404 		}
405 		in.close();
406 	}
407 	catch(exception& e) {
408 		m->errorOut(e, "ClusterFragmentsCommand", "readNameFile");
409 		exit(1);
410 	}
411 }
412 /**************************************************************************************************/
413 
414