1 /*
2 * ryanscommand.cpp
3 * Mothur
4 *
5 * Created by westcott on 9/23/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
7 *
8 */
9
10 #include "clusterfragmentscommand.h"
11 #include "needlemanoverlap.hpp"
12
13 //**********************************************************************************************************************
14 //sort by unaligned
comparePriority(seqRNode first,seqRNode second)15 inline bool comparePriority(seqRNode first, seqRNode second) {
16 bool better = false;
17
18 if (first.length > second.length) {
19 better = true;
20 }else if (first.length == second.length) {
21 if (first.numIdentical > second.numIdentical) {
22 better = true;
23 }
24 }
25
26 return better;
27 }
28 //**********************************************************************************************************************
setParameters()29 vector<string> ClusterFragmentsCommand::setParameters(){
30 try {
31 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","fasta-name",false,true,true); parameters.push_back(pfasta);
32 CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none","name",false,false,true); parameters.push_back(pname);
33 CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none","count",false,false,true); parameters.push_back(pcount);
34 CommandParameter pdiffs("diffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pdiffs);
35 CommandParameter ppercent("percent", "Number", "", "0", "", "", "","",false,false); parameters.push_back(ppercent);
36 CommandParameter pseed("seed", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pseed);
37 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
38 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
39
40 abort = false; calledHelp = false;
41
42 vector<string> tempOutNames;
43 outputTypes["fasta"] = tempOutNames;
44 outputTypes["name"] = tempOutNames;
45 outputTypes["count"] = tempOutNames;
46
47 vector<string> myArray;
48 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
49 return myArray;
50 }
51 catch(exception& e) {
52 m->errorOut(e, "ClusterFragmentsCommand", "setParameters");
53 exit(1);
54 }
55 }
56 //**********************************************************************************************************************
getHelpString()57 string ClusterFragmentsCommand::getHelpString(){
58 try {
59 string helpString = "";
60 helpString += "The cluster.fragments command groups sequences that are part of a larger sequence.\n";
61 helpString += "The cluster.fragments command outputs a new fasta and name or count file.\n";
62 helpString += "The cluster.fragments command parameters are fasta, name, count, diffs and percent. The fasta parameter is required, unless you have a valid current file. \n";
63 helpString += "The names parameter allows you to give a list of seqs that are identical. This file is 2 columns, first column is name or representative sequence, second column is a list of its identical sequences separated by commas.\n";
64 helpString += "The diffs parameter allows you to set the number of differences allowed, default=0. \n";
65 helpString += "The percent parameter allows you to set percentage of differences allowed, default=0. percent=2 means if the number of difference is less than or equal to two percent of the length of the fragment, then cluster.\n";
66 helpString += "You may use diffs and percent at the same time to say something like: If the number or differences is greater than 1 or more than 2% of the fragment length, don't merge. \n";
67 helpString += "The cluster.fragments command should be in the following format: \n";
68 helpString += "cluster.fragments(fasta=yourFastaFile, names=yourNamesFile) \n";
69 helpString += "Example cluster.fragments(fasta=amazon.fasta).\n";
70 ;
71 return helpString;
72 }
73 catch(exception& e) {
74 m->errorOut(e, "ClusterFragmentsCommand", "getHelpString");
75 exit(1);
76 }
77 }
78 //**********************************************************************************************************************
getOutputPattern(string type)79 string ClusterFragmentsCommand::getOutputPattern(string type) {
80 try {
81 string pattern = "";
82
83 if (type == "fasta") { pattern = "[filename],fragclust.fasta"; }
84 else if (type == "name") { pattern = "[filename],fragclust.names"; }
85 else if (type == "count") { pattern = "[filename],fragclust.count_table"; }
86 else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->setControl_pressed(true); }
87
88 return pattern;
89 }
90 catch(exception& e) {
91 m->errorOut(e, "ClusterFragmentsCommand", "getOutputPattern");
92 exit(1);
93 }
94 }
95 //**********************************************************************************************************************
ClusterFragmentsCommand(string option)96 ClusterFragmentsCommand::ClusterFragmentsCommand(string option) : Command() {
97 try {
98 //allow user to run help
99 if(option == "help") { help(); abort = true; calledHelp = true; }
100 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
101 else if(option == "category") { abort = true; calledHelp = true; }
102
103 else {
104 OptionParser parser(option, setParameters());
105 map<string, string> parameters = parser.getParameters();
106
107 ValidParameters validParameter;
108 fastafile = validParameter.validFile(parameters, "fasta");
109 if (fastafile == "not found") {
110 fastafile = current->getFastaFile();
111 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter.\n"); }
112 else { m->mothurOut("You have no current fastafile and the fasta parameter is required.\n"); abort = true; }
113 }
114 else if (fastafile == "not open") { fastafile = ""; abort = true; }
115 else { current->setFastaFile(fastafile); }
116
117
118 if (outputdir == ""){ outputdir = util.hasPath(fastafile); }
119
120 //check for optional parameter and set defaults
121 // ...at some point should added some additional type checking...
122 namefile = validParameter.validFile(parameters, "name");
123 if (namefile == "not found") { namefile = ""; }
124 else if (namefile == "not open") { namefile = ""; abort = true; }
125 else { readNameFile(); current->setNameFile(namefile); }
126
127 countfile = validParameter.validFile(parameters, "count");
128 if (countfile == "not open") { abort = true; countfile = ""; }
129 else if (countfile == "not found") { countfile = ""; }
130 else { ct.readTable(countfile, true, false); current->setCountFile(countfile); }
131
132 if ((countfile != "") && (namefile != "")) { m->mothurOut("When executing a cluster.fragments command you must enter ONLY ONE of the following: count or name.\n"); abort = true; }
133
134 string temp;
135 temp = validParameter.valid(parameters, "diffs"); if (temp == "not found"){ temp = "0"; }
136 util.mothurConvert(temp, diffs);
137
138 temp = validParameter.valid(parameters, "percent"); if (temp == "not found"){ temp = "0"; }
139 util.mothurConvert(temp, percent);
140
141 if (countfile == "") {
142 if (namefile == "") {
143 vector<string> files; files.push_back(fastafile);
144 if (!current->getMothurCalling()) { parser.getNameFile(files); }
145 }
146 }
147 }
148 }
149 catch(exception& e) {
150 m->errorOut(e, "ClusterFragmentsCommand", "ClusterFragmentsCommand");
151 exit(1);
152 }
153 }
154 //**********************************************************************************************************************
execute()155 int ClusterFragmentsCommand::execute(){
156 try {
157
158 if (abort) { if (calledHelp) { return 0; } return 2; }
159
160 long start = time(NULL);
161
162 //reads fasta file and return number of seqs
163 int numSeqs = readFASTA(); //fills alignSeqs and makes all seqs active
164
165 if (m->getControl_pressed()) { return 0; }
166
167 if (numSeqs == 0) { m->mothurOut("Error reading fasta file...please correct.\n"); return 0; }
168
169 //sort seqs by length of unaligned sequence
170 sort(alignSeqs.begin(), alignSeqs.end(), comparePriority);
171
172 int count = 0;
173
174 //think about running through twice...
175 for (int i = 0; i < numSeqs; i++) {
176
177 if (alignSeqs[i].active) { //this sequence has not been merged yet
178
179 string iBases = alignSeqs[i].seq.getUnaligned();
180
181 //try to merge it with all smaller seqs
182 for (int j = i+1; j < numSeqs; j++) {
183
184 if (m->getControl_pressed()) { return 0; }
185
186 if (alignSeqs[j].active) { //this sequence has not been merged yet
187
188 string jBases = alignSeqs[j].seq.getUnaligned();
189
190 if (isFragment(iBases, jBases)) {
191 if (countfile != "") {
192 ct.mergeCounts(alignSeqs[i].names, alignSeqs[j].names);
193 }else {
194 //merge
195 alignSeqs[i].names += ',' + alignSeqs[j].names;
196 alignSeqs[i].numIdentical += alignSeqs[j].numIdentical;
197 }
198 alignSeqs[j].active = 0;
199 alignSeqs[j].numIdentical = 0;
200 count++;
201 }
202 }//end if j active
203 }//end if i != j
204
205 //remove from active list
206 alignSeqs[i].active = 0;
207
208 }//end if active i
209 if(i % 100 == 0) { m->mothurOutJustToScreen(toString(i) + "\t" + toString(numSeqs - count) + "\t" + toString(count)+"\n"); }
210 }
211
212 if(numSeqs % 100 != 0) { m->mothurOutJustToScreen(toString(numSeqs) + "\t" + toString(numSeqs - count) + "\t" + toString(count)+"\n"); }
213
214
215 string fileroot = outputdir + util.getRootName(util.getSimpleName(fastafile));
216 map<string, string> variables;
217 variables["[filename]"] = fileroot;
218 string newFastaFile = getOutputFileName("fasta", variables);
219 string newNamesFile = getOutputFileName("name", variables);
220 if (countfile != "") { newNamesFile = getOutputFileName("count", variables); }
221
222 if (m->getControl_pressed()) { return 0; }
223
224 m->mothurOut("\nTotal number of sequences before cluster.fragments was " + toString(alignSeqs.size()) + ".\n");
225 m->mothurOut("cluster.fragments removed " + toString(count) + " sequences.\n\n");
226
227 printData(newFastaFile, newNamesFile);
228
229 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to cluster " + toString(numSeqs) + " sequences.\n");
230
231 if (m->getControl_pressed()) { util.mothurRemove(newFastaFile); util.mothurRemove(newNamesFile); return 0; }
232
233 m->mothurOut("\nOutput File Names: \n");
234 m->mothurOut(newFastaFile); m->mothurOutEndLine();
235 m->mothurOut(newNamesFile); m->mothurOutEndLine();
236 outputNames.push_back(newFastaFile); outputNames.push_back(newNamesFile); outputTypes["fasta"].push_back(newFastaFile); outputTypes["name"].push_back(newNamesFile);
237 m->mothurOutEndLine();
238
239 //set fasta file as new current fastafile
240 string currentName = "";
241 itTypes = outputTypes.find("fasta");
242 if (itTypes != outputTypes.end()) {
243 if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setFastaFile(currentName); }
244 }
245
246 itTypes = outputTypes.find("name");
247 if (itTypes != outputTypes.end()) {
248 if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setNameFile(currentName); }
249 }
250
251 itTypes = outputTypes.find("count");
252 if (itTypes != outputTypes.end()) {
253 if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setCountFile(currentName); }
254 }
255
256 return 0;
257
258 }
259 catch(exception& e) {
260 m->errorOut(e, "ClusterFragmentsCommand", "execute");
261 exit(1);
262 }
263 }
264 //***************************************************************************************************************
isFragment(string seq1,string seq2)265 bool ClusterFragmentsCommand::isFragment(string seq1, string seq2){
266 try {
267 bool fragment = false;
268
269 //exact match
270 int pos = seq1.find(seq2);
271 if (pos != string::npos) { return true; }
272 //no match, no diffs wanted
273 else if ((diffs == 0) && (percent == 0)) { return false; }
274 else { //try aligning and see if you can find it
275
276 //find number of acceptable differences for this sequence fragment
277 int totalDiffs = 0;
278 if (diffs == 0) { //you didnt set diffs you want a percentage
279 totalDiffs = floor((seq2.length() * (percent / 100.0)));
280 }else if (percent == 0) { //you didn't set percent you want diffs
281 totalDiffs = diffs;
282 }else if ((percent != 0) && (diffs != 0)) { //you want both, set total diffs to smaller of 2
283 totalDiffs = diffs;
284 int percentDiff = floor((seq2.length() * (percent / 100.0)));
285 if (percentDiff < totalDiffs) { totalDiffs = percentDiff; }
286 }
287
288 Alignment* alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (seq1.length()+totalDiffs+1));
289
290 //use needleman to align
291 alignment->align(seq2, seq1);
292 string tempSeq2 = alignment->getSeqAAln();
293 string temp = alignment->getSeqBAln();
294
295 delete alignment;
296
297 //chop gap ends
298 int startPos = 0;
299 int endPos = tempSeq2.length()-1;
300 for (int i = 0; i < tempSeq2.length(); i++) { if (isalpha(tempSeq2[i])) { startPos = i; break; } }
301 for (int i = tempSeq2.length()-1; i >= 0; i--) { if (isalpha(tempSeq2[i])) { endPos = i; break; } }
302
303 //count number of diffs
304 int numDiffs = 0;
305 for (int i = startPos; i <= endPos; i++) {
306 if (tempSeq2[i] != temp[i]) { numDiffs++; }
307 }
308
309 if (numDiffs <= totalDiffs) { fragment = true; }
310
311 }
312
313 return fragment;
314
315 }
316 catch(exception& e) {
317 m->errorOut(e, "ClusterFragmentsCommand", "isFragment");
318 exit(1);
319 }
320 }
321 /**************************************************************************************************/
readFASTA()322 int ClusterFragmentsCommand::readFASTA(){
323 try {
324
325 ifstream inFasta;
326 util.openInputFile(fastafile, inFasta);
327
328 while (!inFasta.eof()) {
329
330 if (m->getControl_pressed()) { inFasta.close(); return 0; }
331
332 Sequence seq(inFasta); util.gobble(inFasta);
333
334 if (seq.getName() != "") { //can get "" if commented line is at end of fasta file
335 if (namefile != "") {
336 itSize = sizes.find(seq.getName());
337
338 if (itSize == sizes.end()) { m->mothurOut(seq.getName() + " is not in your names file, please correct.\n"); exit(1); }
339 else{
340 seqRNode tempNode(itSize->second, seq, names[seq.getName()], seq.getUnaligned().length());
341 alignSeqs.push_back(tempNode);
342 }
343 }else if(countfile != "") {
344 seqRNode tempNode(ct.getNumSeqs(seq.getName()), seq, seq.getName(), seq.getUnaligned().length());
345 alignSeqs.push_back(tempNode);
346 }else { //no names file, you are identical to yourself
347 seqRNode tempNode(1, seq, seq.getName(), seq.getUnaligned().length());
348 alignSeqs.push_back(tempNode);
349 }
350 }
351 }
352
353 inFasta.close();
354 return alignSeqs.size();
355 }
356
357 catch(exception& e) {
358 m->errorOut(e, "ClusterFragmentsCommand", "readFASTA");
359 exit(1);
360 }
361 }
362 /**************************************************************************************************/
printData(string newfasta,string newname)363 void ClusterFragmentsCommand::printData(string newfasta, string newname){
364 try {
365 ofstream outFasta;
366 ofstream outNames;
367
368 util.openOutputFile(newfasta, outFasta);
369 if (countfile == "") { util.openOutputFile(newname, outNames); }
370
371 for (int i = 0; i < alignSeqs.size(); i++) {
372 if (alignSeqs[i].numIdentical != 0) {
373 alignSeqs[i].seq.printSequence(outFasta);
374 if (countfile == "") { outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl; }
375 }
376 }
377
378 outFasta.close();
379 if (countfile == "") { outNames.close(); }
380 else { ct.printTable(newname); }
381 }
382 catch(exception& e) {
383 m->errorOut(e, "ClusterFragmentsCommand", "printData");
384 exit(1);
385 }
386 }
387 /**************************************************************************************************/
388
readNameFile()389 void ClusterFragmentsCommand::readNameFile(){
390 try {
391 ifstream in;
392 util.openInputFile(namefile, in);
393 string firstCol, secondCol;
394
395 while (!in.eof()) {
396 in >> firstCol >> secondCol; util.gobble(in);
397 names[firstCol] = secondCol;
398 int size = 1;
399
400 for(int i=0;i<secondCol.size();i++){
401 if(secondCol[i] == ','){ size++; }
402 }
403 sizes[firstCol] = size;
404 }
405 in.close();
406 }
407 catch(exception& e) {
408 m->errorOut(e, "ClusterFragmentsCommand", "readNameFile");
409 exit(1);
410 }
411 }
412 /**************************************************************************************************/
413
414