1 /*
2 * distancecommand.cpp
3 * Mothur
4 *
5 * Created by Sarah Westcott on 5/7/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7 *
8 */
9
10 #include "distancecommand.h"
11
12 //**********************************************************************************************************************
setParameters()13 vector<string> DistanceCommand::setParameters(){
14 try {
15 CommandParameter pcolumn("column", "InputTypes", "", "", "none", "none", "OldFastaColumn","column",false,false); parameters.push_back(pcolumn);
16 CommandParameter poldfasta("oldfasta", "InputTypes", "", "", "none", "none", "OldFastaColumn","",false,false); parameters.push_back(poldfasta);
17 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","phylip-column",false,true, true); parameters.push_back(pfasta);
18 CommandParameter poutput("output", "Multiple", "column-lt-square", "column", "", "", "","phylip-column",false,false, true); parameters.push_back(poutput);
19 CommandParameter pcalc("calc", "Multiple", "nogaps-eachgap-onegap-jtt-pmb-pam-kimura", "onegap", "", "", "","",false,false); parameters.push_back(pcalc);
20 CommandParameter pcountends("countends", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pcountends);
21 CommandParameter pfitcalc("fitcalc", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pfitcalc);
22 CommandParameter pcompress("compress", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pcompress);
23 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false, true); parameters.push_back(pprocessors);
24 CommandParameter pcutoff("cutoff", "Number", "", "1.0", "", "", "","",false,false, true); parameters.push_back(pcutoff);
25 CommandParameter pseed("seed", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pseed);
26 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
27 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
28
29 abort = false; calledHelp = false;
30
31 vector<string> tempOutNames;
32 outputTypes["phylip"] = tempOutNames;
33 outputTypes["column"] = tempOutNames;
34
35 vector<string> myArray;
36 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
37 return myArray;
38 }
39 catch(exception& e) {
40 m->errorOut(e, "DistanceCommand", "setParameters");
41 exit(1);
42 }
43 }
44 //**********************************************************************************************************************
getHelpString()45 string DistanceCommand::getHelpString(){
46 try {
47 string helpString = "";
48 helpString += "The dist.seqs command reads a file containing sequences and creates a distance file.\n";
49 helpString += "The dist.seqs command parameters are fasta, oldfasta, column, calc, countends, output, compress, cutoff and processors. \n";
50 helpString += "The fasta parameter is required, unless you have a valid current fasta file.\n";
51 helpString += "The oldfasta and column parameters allow you to append the distances calculated to the column file.\n";
52 helpString += "The calc parameter allows you to specify the method of calculating the distances. Your options are: nogaps, onegap or eachgap for dna/rna sequences. If using protein sequences, your calc options are jtt, pmb, pam and kimura. The default is onegap.\n";
53 helpString += "The countends parameter allows you to specify whether to include terminal gaps in distance. Your options are: T or F. The default is T.\n";
54 helpString += "The cutoff parameter allows you to specify maximum distance to keep. The default is 1.0.\n";
55 helpString += "The output parameter allows you to specify format of your distance matrix. Options are column, lt, and square. The default is column.\n";
56 helpString += "The processors parameter allows you to specify number of processors to use. The default is 1.\n";
57 helpString += "The compress parameter allows you to indicate that you want the resulting distance file compressed. The default is false.\n";
58 helpString += "The dist.seqs command should be in the following format: \n";
59 helpString += "dist.seqs(fasta=yourFastaFile, calc=yourCalc, countends=yourEnds, cutoff= yourCutOff, processors=yourProcessors) \n";
60 helpString += "Example dist.seqs(fasta=amazon.fasta, calc=eachgap, countends=F, cutoff= 2.0, processors=3).\n";
61 return helpString;
62 }
63 catch(exception& e) {
64 m->errorOut(e, "DistanceCommand", "getHelpString");
65 exit(1);
66 }
67 }
68 //**********************************************************************************************************************
getOutputPattern(string type)69 string DistanceCommand::getOutputPattern(string type) {
70 try {
71 string pattern = "";
72
73 if (type == "phylip") { pattern = "[filename],[outputtag],dist"; }
74 else if (type == "column") { pattern = "[filename],dist-[filename],[outputtag],dist"; }
75 else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->setControl_pressed(true); }
76
77 return pattern;
78 }
79 catch(exception& e) {
80 m->errorOut(e, "DistanceCommand", "getOutputPattern");
81 exit(1);
82 }
83 }
84 //**********************************************************************************************************************
DistanceCommand(string option)85 DistanceCommand::DistanceCommand(string option) : Command() {
86 try {
87 //allow user to run help
88 if(option == "help") { help(); abort = true; calledHelp = true; }
89 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
90 else if(option == "category") { abort = true; calledHelp = true; }
91
92 else {
93 OptionParser parser(option, setParameters());
94 map<string, string> parameters = parser.getParameters();
95
96 ValidParameters validParameter;
97 fastafile = validParameter.validFile(parameters, "fasta");
98 if (fastafile == "not found") {
99 fastafile = current->getFastaFile();
100 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter.\n"); }
101 else { m->mothurOut("You have no current fastafile and the fasta parameter is required.\n"); abort = true; }
102 }else if (fastafile == "not open") { abort = true; }
103 else{ current->setFastaFile(fastafile); }
104
105 oldfastafile = validParameter.validFile(parameters, "oldfasta");
106 if (oldfastafile == "not found") { oldfastafile = ""; }
107 else if (oldfastafile == "not open") { abort = true; }
108
109 column = validParameter.validFile(parameters, "column");
110 if (column == "not found") { column = ""; }
111 else if (column == "not open") { abort = true; }
112 else { current->setColumnFile(column); }
113
114 if (outputdir == ""){ outputdir += util.hasPath(fastafile); }
115
116 //check for optional parameter and set defaults
117 // ...at some point should added some additional type checking...
118 calc = validParameter.valid(parameters, "calc");
119 if (calc == "not found") { calc = "onegap"; }
120 else {
121 if (calc == "default") { calc = "onegap"; }
122 }
123
124 string temp;
125 temp = validParameter.valid(parameters, "countends"); if(temp == "not found"){ temp = "T"; }
126 countends = util.isTrue(temp);
127
128 temp = validParameter.valid(parameters, "fitcalc"); if(temp == "not found"){ temp = "F"; }
129 fitCalc = util.isTrue(temp);
130
131 temp = validParameter.valid(parameters, "cutoff"); if(temp == "not found"){ temp = "1.0"; }
132 util.mothurConvert(temp, cutoff);
133
134 temp = validParameter.valid(parameters, "processors"); if (temp == "not found"){ temp = current->getProcessors(); }
135 processors = current->setProcessors(temp);
136
137 temp = validParameter.valid(parameters, "compress"); if(temp == "not found"){ temp = "F"; }
138 compress = util.isTrue(temp);
139
140 output = validParameter.valid(parameters, "output"); if(output == "not found"){ output = "column"; }
141 if (output == "phylip") { output = "lt"; }
142
143 if (((column != "") && (oldfastafile == "")) || ((column == "") && (oldfastafile != ""))) { m->mothurOut("If you provide column or oldfasta, you must provide both.\n"); abort=true; }
144
145 if ((column != "") && (oldfastafile != "") && (output != "column")) { m->mothurOut("You have provided column and oldfasta, indicating you want to append distances to your column file. Your output must be in column format to do so.\n"); abort=true; }
146
147 if ((output != "column") && (output != "lt") && (output != "square")) { m->mothurOut(output + " is not a valid output form. Options are column, lt and square. I will use column.\n"); output = "column"; }
148
149 if ((calc != "onegap") && (calc != "eachgap") && (calc != "nogaps") && (calc != "jtt") && (calc != "pmb") && (calc != "pam") && (calc != "kimura")) { m->mothurOut(calc + " is not a valid calc. Options are eachgap, onegap, nogaps, jtt, pmb, pam and kimura. I'll use onegap.\n"); calc = "onegap"; }
150
151 prot = false; //not using protein sequences
152 if ((calc == "jtt") || (calc == "pmb") || (calc == "pam") || (calc == "kimura")) { prot = true; }
153
154 }
155 }
156 catch(exception& e) {
157 m->errorOut(e, "DistanceCommand", "DistanceCommand");
158 exit(1);
159 }
160 }
161 //**********************************************************************************************************************
DistanceCommand(StorageDatabase * & storageDB,string outputFileRoot,double cut,string outputformat,int proc)162 DistanceCommand::DistanceCommand(StorageDatabase*& storageDB, string outputFileRoot, double cut, string outputformat, int proc) : Command() {
163 try {
164 abort = false; calledHelp = false;
165 vector<string> tempOutNames;
166 outputTypes["phylip"] = tempOutNames;
167 outputTypes["column"] = tempOutNames;
168
169 calc = "onegap";
170 countends = true;
171 fitCalc = false;
172 cutoff = cut;
173 processors = proc;
174 compress = false;
175 output = outputformat;
176 prot = false; //not using protein sequences
177
178 numDistsBelowCutoff = 0;
179 db = storageDB;
180 numNewFasta = db->getNumSeqs();
181 numSeqs = db->getNumSeqs();
182
183 if (!db->sameLength()) { m->mothurOut("[ERROR]: your sequences are not the same length, aborting.\n"); return; }
184 if (numSeqs < 2) { m->mothurOut("[ERROR]: you must have at least 2 sequences to calculate the distances, aborting.\n"); return; }
185
186 string outputFile;
187 map<string, string> variables;
188 variables["[filename]"] = outputFileRoot;
189
190 if (output == "lt") { //does the user want lower triangle phylip formatted file
191 variables["[outputtag]"] = "phylip";
192 outputFile = getOutputFileName("phylip", variables);
193 util.mothurRemove(outputFile); outputTypes["phylip"].push_back(outputFile);
194
195 //output numSeqs to phylip formatted dist file
196 }else if (output == "column") { //user wants column format
197
198 outputFile = getOutputFileName("column", variables);
199 outputTypes["column"].push_back(outputFile);
200 util.mothurRemove(outputFile);
201 }
202
203
204 m->mothurOut("\nSequence\tTime\tNum_Dists_Below_Cutoff\n");
205
206 createProcesses(outputFile);
207
208 m->mothurOut("\nOutput File Names:\n"); m->mothurOut(outputFile+"\n\n");
209
210 }
211 catch(exception& e) {
212 m->errorOut(e, "DistanceCommand", "DistanceCommand");
213 exit(1);
214 }
215 }
216 //**********************************************************************************************************************
217
execute()218 int DistanceCommand::execute(){
219 try {
220
221 if (abort) { if (calledHelp) { return 0; } return 2; }
222
223 numDistsBelowCutoff = 0;
224
225 ifstream inFASTA; util.openInputFile(fastafile, inFASTA);
226 if (prot) { db = new ProteinDB(inFASTA); }
227 else { db = new SequenceDB(inFASTA); }
228 inFASTA.close();
229
230 //save number of new sequence
231 numNewFasta = db->getNumSeqs();
232
233 //sanity check the oldfasta and column file as well as add oldfasta sequences to db
234 if ((oldfastafile != "") && (column != "")) { if (!(sanityCheck())) { return 0; } }
235
236 if (m->getControl_pressed()) { delete db; return 0; }
237
238 numSeqs = db->getNumSeqs();
239
240 if (!db->sameLength()) { m->mothurOut("[ERROR]: your sequences are not the same length, aborting.\n"); return 0; }
241 if (numSeqs < 2) { m->mothurOut("[ERROR]: you must have at least 2 sequences to calculate the distances, aborting.\n"); return 0; }
242
243
244 string outputFile;
245 map<string, string> variables;
246 variables["[filename]"] = outputdir + util.getRootName(util.getSimpleName(fastafile));
247 if ((oldfastafile != "") && (column != "")) { variables["[filename]"] = outputdir + util.getRootName(util.getSimpleName(oldfastafile)); }
248
249 if (output == "lt") { //does the user want lower triangle phylip formatted file
250 variables["[outputtag]"] = "phylip";
251 outputFile = getOutputFileName("phylip", variables);
252 util.mothurRemove(outputFile); outputTypes["phylip"].push_back(outputFile);
253
254 //output numSeqs to phylip formatted dist file
255 }else if (output == "column") { //user wants column format
256 if (fitCalc) { variables["[outputtag]"] = "fit"; }
257 outputFile = getOutputFileName("column", variables);
258 outputTypes["column"].push_back(outputFile);
259
260 //so we don't accidentally overwrite
261 if (outputFile == column) {
262 string tempcolumn = column + ".old";
263 rename(column.c_str(), tempcolumn.c_str());
264 }
265
266 util.mothurRemove(outputFile);
267 }else { //assume square
268 variables["[outputtag]"] = "square";
269 outputFile = getOutputFileName("phylip", variables);
270 util.mothurRemove(outputFile);
271 outputTypes["phylip"].push_back(outputFile);
272 }
273 m->mothurOut("\nSequence\tTime\tNum_Dists_Below_Cutoff\n");
274
275 createProcesses(outputFile);
276
277 if (m->getControl_pressed()) { outputTypes.clear(); util.mothurRemove(outputFile); return 0; }
278
279 ifstream fileHandle; fileHandle.open(outputFile.c_str());
280 if(fileHandle) {
281 util.gobble(fileHandle);
282 if (fileHandle.eof()) { m->mothurOut(outputFile + " is blank. This can result if there are no distances below your cutoff.\n"); }
283 }
284
285 //append the old column file to the new one
286 if ((oldfastafile != "") && (column != "")) {
287 //we had to rename the column file so we didnt overwrite above, but we want to keep old name
288 if (outputFile == column) {
289 string tempcolumn = column + ".old";
290 util.appendFiles(tempcolumn, outputFile);
291 util.mothurRemove(tempcolumn);
292 }else{
293 if (!fitCalc) {
294 util.appendFiles(outputFile, column);
295 util.mothurRemove(outputFile);
296 outputFile = column;
297 }
298 }
299 outputTypes["column"].clear(); outputTypes["column"].push_back(outputFile);
300 }
301
302 if (m->getControl_pressed()) { outputTypes.clear(); util.mothurRemove(outputFile); return 0; }
303
304 //set phylip file as new current phylipfile
305 string currentName = "";
306 itTypes = outputTypes.find("phylip");
307 if (itTypes != outputTypes.end()) {
308 if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setPhylipFile(currentName); }
309 }
310
311 //set column file as new current columnfile
312 itTypes = outputTypes.find("column");
313 if (itTypes != outputTypes.end()) {
314 if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setColumnFile(currentName); }
315 }
316
317 m->mothurOut("\nOutput File Names: \n");
318 m->mothurOut(outputFile+"\n\n");
319
320 if (compress) {
321 m->mothurOut("Compressing...\n");
322 m->mothurOut("(Replacing " + outputFile + " with " + outputFile + ".gz)\n");
323 system(("gzip -v " + outputFile).c_str());
324 outputNames.push_back(outputFile + ".gz");
325 }else { outputNames.push_back(outputFile); }
326
327 return 0;
328
329 }
330 catch(exception& e) {
331 m->errorOut(e, "DistanceCommand", "execute");
332 exit(1);
333 }
334 }
335 /**************************************************************************************************/
driverColumn(distanceData * params)336 void driverColumn(distanceData* params){
337 try {
338 ValidCalculators validCalculator;
339 DistCalc* distCalculator;
340
341 if (!params->prot) {
342 if (params->countends) {
343 if (validCalculator.isValidCalculator("distance", params->calc) ) {
344 if (params->calc == "nogaps") { distCalculator = new ignoreGaps(params->cutoff); }
345 else if (params->calc == "eachgap") { distCalculator = new eachGapDist(params->cutoff); }
346 else if (params->calc == "onegap") { distCalculator = new oneGapDist(params->cutoff); }
347 }
348 }else {
349 if (validCalculator.isValidCalculator("distance", params->calc) ) {
350 if (params->calc == "nogaps") { distCalculator = new ignoreGaps(params->cutoff); }
351 else if (params->calc == "eachgap"){ distCalculator = new eachGapIgnoreTermGapDist(params->cutoff); }
352 else if (params->calc == "onegap") { distCalculator = new oneGapIgnoreTermGapDist(params->cutoff); }
353 }
354 }
355 }else {
356 if (validCalculator.isValidCalculator("protdist", params->calc) ) {
357 if (params->calc == "jtt") { distCalculator = new JTT(params->cutoff); }
358 else if (params->calc == "pmb") { distCalculator = new PMB(params->cutoff); }
359 else if (params->calc == "pam") { distCalculator = new PAM(params->cutoff); }
360 else if (params->calc == "kimura") { distCalculator = new Kimura(params->cutoff); }
361 }
362 }
363
364
365 int startTime = time(NULL);
366
367 params->count = 0;
368 string buffer = "";
369
370 for(int i=params->startLine;i<params->endLine;i++){
371
372 Sequence seqI; Protein seqIP; string nameI = "";
373 if (params->prot) { seqIP = params->db->getProt(i); nameI = seqIP.getName(); }
374 else { seqI = params->db->getSeq(i); nameI = seqI.getName(); }
375
376 for(int j=0;j<i;j++){
377
378 if (params->m->getControl_pressed()) { break; }
379
380 if ((i >= params->numNewFasta) && (j >= params->numNewFasta)) { break; }
381
382 double dist = 1.0; string nameJ = "";
383 if (params->prot) { Protein seqJP = params->db->getProt(j); nameJ = seqJP.getName(); dist = distCalculator->calcDist(seqIP, seqJP); }
384 else { Sequence seqJ = params->db->getSeq(j); nameJ = seqJ.getName(); dist = distCalculator->calcDist(seqI, seqJ); }
385
386
387 if(dist <= params->cutoff){
388 buffer += (nameI + " " + nameJ + " " + toString(dist) + "\n");
389 params->count++;
390 }
391 }
392
393 if(i % 100 == 0){ params->threadWriter->write(buffer); buffer = ""; params->m->mothurOutJustToScreen(toString(i) + "\t" + toString(time(NULL) - startTime) + "\t" + toString(params->count) +"\n"); }
394
395 }
396 params->threadWriter->write(buffer);
397
398 if((params->endLine-1) % 100 != 0){ params->m->mothurOutJustToScreen(toString(params->endLine-1) + "\t" + toString(time(NULL) - startTime) + "\t" + toString(params->count) +"\n"); }
399
400 delete distCalculator;
401 }
402 catch(exception& e) {
403 params->m->errorOut(e, "DistanceCommand", "driverColumn");
404 exit(1);
405 }
406 }
407 /**************************************************************************************************/
driverLt(distanceData * params)408 void driverLt(distanceData* params){
409 try {
410 ValidCalculators validCalculator;
411 DistCalc* distCalculator;
412 double cutoff = 1.0;
413
414 if (!params->prot) {
415 if (params->countends) {
416 if (validCalculator.isValidCalculator("distance", params->calc) ) {
417 if (params->calc == "nogaps") { distCalculator = new ignoreGaps(cutoff); }
418 else if (params->calc == "eachgap") { distCalculator = new eachGapDist(cutoff); }
419 else if (params->calc == "onegap") { distCalculator = new oneGapDist(cutoff); }
420 }
421 }else {
422 if (validCalculator.isValidCalculator("distance", params->calc) ) {
423 if (params->calc == "nogaps") { distCalculator = new ignoreGaps(cutoff); }
424 else if (params->calc == "eachgap"){ distCalculator = new eachGapIgnoreTermGapDist(cutoff); }
425 else if (params->calc == "onegap") { distCalculator = new oneGapIgnoreTermGapDist(cutoff); }
426 }
427 }
428 }else {
429 if (validCalculator.isValidCalculator("protdist", params->calc) ) {
430 if (params->calc == "jtt") { distCalculator = new JTT(params->cutoff); }
431 else if (params->calc == "pmb") { distCalculator = new PMB(params->cutoff); }
432 else if (params->calc == "pam") { distCalculator = new PAM(params->cutoff); }
433 else if (params->calc == "kimura") { distCalculator = new Kimura(params->cutoff); }
434 }
435 }
436
437 int startTime = time(NULL);
438 long long numSeqs = params->db->getNumSeqs();
439
440 //column file
441 ofstream outFile; params->util.openOutputFile(params->outputFileName, outFile);
442 outFile.setf(ios::fixed, ios::showpoint); outFile << setprecision(4);
443
444 if(params->startLine == 0){ outFile << numSeqs << endl; }
445
446 params->count = 0;
447 for(int i=params->startLine;i<params->endLine;i++){
448
449 Sequence seqI; Protein seqIP; string nameI = "";
450 if (params->prot) { seqIP = params->db->getProt(i); nameI = seqIP.getName(); }
451 else { seqI = params->db->getSeq(i); nameI = seqI.getName(); }
452
453 if (nameI.length() < 10) { while (nameI.length() < 10) { nameI += " "; } }
454 outFile << nameI;
455
456 for(int j=0;j<i;j++){
457
458 if (params->m->getControl_pressed()) { break; }
459
460 if ((i >= params->numNewFasta) && (j >= params->numNewFasta)) { break; }
461
462 double dist = 1.0;
463 if (params->prot) { Protein seqJP = params->db->getProt(j); dist = distCalculator->calcDist(seqIP, seqJP); }
464 else { Sequence seqJ = params->db->getSeq(j); dist = distCalculator->calcDist(seqI, seqJ); }
465
466 if(dist <= params->cutoff){ params->count++; }
467 outFile << '\t' << dist;
468 }
469
470 outFile << endl;
471
472 if(i % 100 == 0){ params->m->mothurOutJustToScreen(toString(i) + "\t" + toString(time(NULL) - startTime) + "\t" + toString(params->count) +"\n"); }
473
474 }
475
476 if((params->endLine-1) % 100 != 0){ params->m->mothurOutJustToScreen(toString(params->endLine-1) + "\t" + toString(time(NULL) - startTime) + "\t" + toString(params->count) +"\n"); }
477
478 outFile.close();
479 delete distCalculator;
480 }
481 catch(exception& e) {
482 params->m->errorOut(e, "DistanceCommand", "driverLt");
483 exit(1);
484 }
485 }
486 /**************************************************************************************************/
driverSquare(distanceData * params)487 void driverSquare(distanceData* params){
488 try {
489 ValidCalculators validCalculator;
490 DistCalc* distCalculator;
491 double cutoff = 1.0;
492
493 if (!params->prot) {
494 if (params->countends) {
495 if (validCalculator.isValidCalculator("distance", params->calc) ) {
496 if (params->calc == "nogaps") { distCalculator = new ignoreGaps(cutoff); }
497 else if (params->calc == "eachgap") { distCalculator = new eachGapDist(cutoff); }
498 else if (params->calc == "onegap") { distCalculator = new oneGapDist(cutoff); }
499 }
500 }else {
501 if (validCalculator.isValidCalculator("distance", params->calc) ) {
502 if (params->calc == "nogaps") { distCalculator = new ignoreGaps(cutoff); }
503 else if (params->calc == "eachgap"){ distCalculator = new eachGapIgnoreTermGapDist(cutoff); }
504 else if (params->calc == "onegap") { distCalculator = new oneGapIgnoreTermGapDist(cutoff); }
505 }
506 }
507 }else {
508 if (validCalculator.isValidCalculator("protdist", params->calc) ) {
509 if (params->calc == "jtt") { distCalculator = new JTT(params->cutoff); }
510 else if (params->calc == "pmb") { distCalculator = new PMB(params->cutoff); }
511 else if (params->calc == "pam") { distCalculator = new PAM(params->cutoff); }
512 else if (params->calc == "kimura") { distCalculator = new Kimura(params->cutoff); }
513 }
514 }
515
516 int startTime = time(NULL);
517
518 //column file
519 ofstream outFile;
520 params->util.openOutputFile(params->outputFileName, outFile);
521 outFile.setf(ios::fixed, ios::showpoint); outFile << setprecision(4);
522
523 long long numSeqs = params->db->getNumSeqs();
524 if(params->startLine == 0){ outFile << numSeqs << endl; }
525
526 params->count = 0;
527 for(int i=params->startLine;i<params->endLine;i++){
528
529 Sequence seqI; Protein seqIP; string nameI = "";
530 if (params->prot) { seqIP = params->db->getProt(i); nameI = seqIP.getName(); }
531 else { seqI = params->db->getSeq(i); nameI = seqI.getName(); }
532
533 if (nameI.length() < 10) { while (nameI.length() < 10) { nameI += " "; } }
534 outFile << nameI << '\t';
535
536 for(int j=0;j<numSeqs;j++){
537
538 if (params->m->getControl_pressed()) { break; }
539
540 double dist = 1.0;
541 if (i == j) { dist = 0.0000; }
542 else {
543 if (params->prot) { Protein seqJP = params->db->getProt(j); dist = distCalculator->calcDist(seqIP, seqJP); }
544 else { Sequence seqJ = params->db->getSeq(j); dist = distCalculator->calcDist(seqI, seqJ); }
545 }
546
547 if(dist <= params->cutoff){ params->count++; }
548
549 outFile << dist << '\t';
550 }
551
552 outFile << endl;
553
554 if(i % 100 == 0){ params->m->mothurOutJustToScreen(toString(i) + "\t" + toString(time(NULL) - startTime) + "\t" + toString(params->count) +"\n"); }
555 }
556
557 if((params->endLine-1) % 100 != 0){ params->m->mothurOutJustToScreen(toString(params->endLine-1) + "\t" + toString(time(NULL) - startTime) + "\t" + toString(params->count) +"\n"); }
558
559 outFile.close();
560 delete distCalculator;
561 }
562 catch(exception& e) {
563 params->m->errorOut(e, "DistanceCommand", "driverSquare");
564 exit(1);
565 }
566 }
567 /**************************************************************************************************/
driverFitCalc(distanceData * params)568 void driverFitCalc(distanceData* params){
569 try {
570 ValidCalculators validCalculator;
571 DistCalc* distCalculator;
572
573 if (!params->prot) {
574 if (params->countends) {
575 if (validCalculator.isValidCalculator("distance", params->calc) ) {
576 if (params->calc == "nogaps") { distCalculator = new ignoreGaps(params->cutoff); }
577 else if (params->calc == "eachgap") { distCalculator = new eachGapDist(params->cutoff); }
578 else if (params->calc == "onegap") { distCalculator = new oneGapDist(params->cutoff); }
579 }
580 }else {
581 if (validCalculator.isValidCalculator("distance", params->calc) ) {
582 if (params->calc == "nogaps") { distCalculator = new ignoreGaps(params->cutoff); }
583 else if (params->calc == "eachgap"){ distCalculator = new eachGapIgnoreTermGapDist(params->cutoff); }
584 else if (params->calc == "onegap") { distCalculator = new oneGapIgnoreTermGapDist(params->cutoff); }
585 }
586 }
587 }else {
588 if (validCalculator.isValidCalculator("protdist", params->calc) ) {
589 if (params->calc == "jtt") { distCalculator = new JTT(params->cutoff); }
590 else if (params->calc == "pmb") { distCalculator = new PMB(params->cutoff); }
591 else if (params->calc == "pam") { distCalculator = new PAM(params->cutoff); }
592 else if (params->calc == "kimura") { distCalculator = new Kimura(params->cutoff); }
593 }
594 }
595
596 int startTime = time(NULL);
597 params->count = 0;
598 string buffer = "";
599 for(int i=params->startLine;i<params->endLine;i++){
600
601 Sequence seqI; Protein seqIP; string nameI = "";
602 if (params->prot) { seqIP = params->oldFastaDB->getProt(i); nameI = seqIP.getName(); }
603 else { seqI = params->oldFastaDB->getSeq(i); nameI = seqI.getName(); }
604
605
606 for(int j = 0; j < params->db->getNumSeqs(); j++){
607
608 if (params->m->getControl_pressed()) { break; }
609
610 double dist = 1.0; string nameJ = "";
611
612 if (params->prot) { Protein seqJP = params->db->getProt(j); nameJ = seqJP.getName(); dist = distCalculator->calcDist(seqIP, seqJP); }
613 else { Sequence seqJ = params->db->getSeq(j); nameJ = seqJ.getName(); dist = distCalculator->calcDist(seqI, seqJ); }
614
615 if(dist <= params->cutoff){
616 buffer += nameI + " " + nameJ + " " + toString(dist) + "\n";
617 params->count++;
618 }
619 }
620
621 if(i % 100 == 0){ params->threadWriter->write(buffer); buffer = ""; params->m->mothurOutJustToScreen(toString(i) + "\t" + toString(time(NULL) - startTime) + "\t" + toString(params->count) +"\n"); }
622
623 }
624 params->threadWriter->write(buffer);
625
626 if((params->endLine-1) % 100 != 0){ params->m->mothurOutJustToScreen(toString(params->endLine-1) + "\t" + toString(time(NULL) - startTime) + "\t" + toString(params->count) +"\n"); }
627
628 delete distCalculator;
629
630 }
631 catch(exception& e) {
632 params->m->errorOut(e, "DistanceCommand", "driverFitCalc");
633 exit(1);
634 }
635 }
636 /**************************************************************************************************/
createProcesses(string filename)637 void DistanceCommand::createProcesses(string filename) {
638 try {
639 long long num = db->getNumSeqs();
640 long long distsBelowCutoff = 0;
641 time_t start, end;
642 time(&start);
643
644 //create array of worker threads
645 vector<std::thread*> workerThreads;
646 vector<distanceData*> data;
647
648 double numDists = 0;
649
650 if (output == "square") { numDists = numSeqs; }
651 else { for(int i=0;i<numSeqs;i++){ for(int j=0;j<i;j++){ numDists++; if (numDists > processors) { break; } } } }
652 if (numDists < processors) { processors = numDists; }
653
654 vector<linePair> lines;
655 for (int i = 0; i < processors; i++) {
656 linePair tempLine;
657 lines.push_back(tempLine);
658 if (output != "square") {
659 lines[i].start = int (sqrt(float(i)/float(processors)) * numSeqs);
660 lines[i].end = int (sqrt(float(i+1)/float(processors)) * numSeqs);
661 }else{
662 lines[i].start = int ((float(i)/float(processors)) * numSeqs);
663 lines[i].end = int ((float(i+1)/float(processors)) * numSeqs);
664 }
665 }
666
667 auto synchronizedOutputFile = std::make_shared<SynchronizedOutputFile>(filename);
668 synchronizedOutputFile->setFixedShowPoint(); synchronizedOutputFile->setPrecision(4);
669
670 StorageDatabase* oldFastaDB;
671 if (fitCalc) {
672 ifstream inFASTA; util.openInputFile(oldfastafile, inFASTA);
673 if (!prot) { oldFastaDB = new SequenceDB(inFASTA); }
674 else { oldFastaDB = new ProteinDB(inFASTA); }
675 inFASTA.close();
676
677 lines.clear();
678 if (processors > oldFastaDB->getNumSeqs()) { processors = oldFastaDB->getNumSeqs(); }
679 int remainingSeqs = oldFastaDB->getNumSeqs();
680 int startIndex = 0;
681 for (int remainingProcessors = processors; remainingProcessors > 0; remainingProcessors--) {
682 int numSeqsToFit = remainingSeqs; //case for last processor
683 if (remainingProcessors != 1) { numSeqsToFit = ceil(remainingSeqs / remainingProcessors); }
684 lines.push_back(linePair(startIndex, (startIndex+numSeqsToFit))); //startIndex, endIndex
685 startIndex = startIndex + numSeqsToFit;
686 remainingSeqs -= numSeqsToFit;
687 }
688 }
689
690 //Lauch worker threads
691 for (int i = 0; i < processors-1; i++) {
692 OutputWriter* threadWriter = NULL;
693 distanceData* dataBundle = NULL;
694 string extension = toString(i+1) + ".temp";
695 if (output == "column") {
696 threadWriter = new OutputWriter(synchronizedOutputFile);
697 dataBundle = new distanceData(threadWriter);
698 }else { dataBundle = new distanceData(filename+extension); }
699 dataBundle->setVariables(lines[i+1].start, lines[i+1].end, cutoff, db, oldFastaDB, calc, prot, numNewFasta, countends);
700 data.push_back(dataBundle);
701
702 std::thread* thisThread = NULL;
703 if (output == "column") {
704 if (fitCalc) { thisThread = new std::thread(driverFitCalc, dataBundle); }
705 else { thisThread = new std::thread(driverColumn, dataBundle); }
706 }
707 else if (output == "lt") { thisThread = new std::thread(driverLt, dataBundle); }
708 else { thisThread = new std::thread(driverSquare, dataBundle); }
709 workerThreads.push_back(thisThread);
710 }
711
712 OutputWriter* threadWriter = NULL;
713 distanceData* dataBundle = NULL;
714 if (output == "column") {
715 threadWriter = new OutputWriter(synchronizedOutputFile);
716 dataBundle = new distanceData(threadWriter);
717 }else { dataBundle = new distanceData(filename); }
718 dataBundle->setVariables(lines[0].start, lines[0].end, cutoff, db, oldFastaDB, calc, prot, numNewFasta, countends);
719
720 if (output == "column") {
721 if (fitCalc) { driverFitCalc(dataBundle); }
722 else { driverColumn(dataBundle); }
723 }
724 else if (output == "lt") { driverLt(dataBundle); }
725 else { driverSquare(dataBundle); }
726 distsBelowCutoff = dataBundle->count;
727
728
729 for (int i = 0; i < processors-1; i++) {
730 workerThreads[i]->join();
731
732 distsBelowCutoff += data[i]->count;
733 if (output == "column") { delete data[i]->threadWriter; }
734 else {
735 string extension = toString(i+1) + ".temp";
736 util.appendFiles((filename+extension), filename);
737 util.mothurRemove(filename+extension);
738 }
739 delete data[i];
740 delete workerThreads[i];
741 }
742 if (output == "column") { synchronizedOutputFile->close(); delete threadWriter; }
743 delete dataBundle;
744
745 time(&end);
746 m->mothurOut("\nIt took " + toString(difftime(end, start)) + " secs to find distances for " + toString(num) + " sequences. " + toString(distsBelowCutoff+numDistsBelowCutoff) + " distances below cutoff " + toString(cutoff) + ".\n\n");
747
748 }
749 catch(exception& e) {
750 m->errorOut(e, "DistanceCommand", "createProcesses");
751 exit(1);
752 }
753 }
754
755 /**************************************************************************************************/
756 //its okay if the column file does not contain all the names in the fasta file, since some distance may have been above a cutoff,
757 //but no sequences can be in the column file that are not in oldfasta. also, if a distance is above the cutoff given then remove it.
758 //also check to make sure the 2 files have the same alignment length.
sanityCheck()759 bool DistanceCommand::sanityCheck() {
760 try{
761 bool good = true;
762
763 //make sure the 2 fasta files have the same alignment length
764 ifstream in; util.openInputFile(fastafile, in);
765 int fastaAlignLength = 0;
766
767 if (in) {
768 if (!prot) {
769 Sequence tempIn(in);
770 fastaAlignLength = tempIn.getAligned().length();
771 }else {
772 Protein tempIn(in);
773 fastaAlignLength = tempIn.getAligned().size();
774 }
775 }
776 in.close();
777
778 ifstream in2; util.openInputFile(oldfastafile, in2);
779 int oldfastaAlignLength = 0;
780 if (in2) {
781 if (!prot) {
782 Sequence tempIn(in2);
783 oldfastaAlignLength = tempIn.getAligned().length();
784 }else {
785 Protein tempIn(in2);
786 oldfastaAlignLength = tempIn.getAligned().size();
787 }
788 }
789 in2.close();
790
791 if (fastaAlignLength != oldfastaAlignLength) { m->mothurOut("fasta files do not have the same alignment length.\n"); return false; }
792
793 //read fasta file and save names as well as adding them to the alignDB
794 set<string> namesOldFasta;
795
796 ifstream inFasta; util.openInputFile(oldfastafile, inFasta);
797
798 while (!inFasta.eof()) {
799 if (m->getControl_pressed()) { inFasta.close(); return good; }
800
801 if (!prot) {
802 Sequence temp(inFasta); util.gobble(inFasta);
803 if (temp.getName() != "") {
804 namesOldFasta.insert(temp.getName()); //save name
805 if (!fitCalc) { db->push_back(temp); }//add to DB
806 }
807 }else {
808 Protein temp(inFasta); util.gobble(inFasta);
809 if (temp.getName() != "") {
810 namesOldFasta.insert(temp.getName()); //save name
811 if (!fitCalc) { db->push_back(temp); }//add to DB
812 }
813 }
814 }
815 inFasta.close();
816
817 //read through the column file checking names and removing distances above the cutoff
818 ifstream inDist;
819 util.openInputFile(column, inDist);
820
821 ofstream outDist;
822 string outputFile = column + ".temp";
823 util.openOutputFile(outputFile, outDist);
824
825 string name1, name2;
826 float dist;
827 while (!inDist.eof()) {
828 if (m->getControl_pressed()) { inDist.close(); outDist.close(); util.mothurRemove(outputFile); return good; }
829
830 inDist >> name1; util.gobble(inDist); inDist >> name2; util.gobble(inDist); inDist >> dist; util.gobble(inDist);
831
832 //both names are in fasta file and distance is below cutoff
833 if ((namesOldFasta.count(name1) == 0) || (namesOldFasta.count(name2) == 0)) { good = false; break; }
834 else{ if (dist <= cutoff) { numDistsBelowCutoff++; outDist << name1 << '\t' << name2 << '\t' << dist << endl; } }
835 }
836
837 inDist.close();
838 outDist.close();
839
840 if (good) {
841 util.mothurRemove(column);
842 rename(outputFile.c_str(), column.c_str());
843 }else{
844 util.mothurRemove(outputFile); //temp file is bad because file mismatch above
845 }
846
847 return good;
848
849 }
850 catch(exception& e) {
851 m->errorOut(e, "DistanceCommand", "sanityCheck");
852 exit(1);
853 }
854 }
855 /**************************************************************************************************/
856
857
858
859
860