1 /*
2  *  unifracweightedcommand.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 2/9/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9 
10 #include "unifracweightedcommand.h"
11 #include "consensus.h"
12 #include "subsample.h"
13 #include "treereader.h"
14 
15 //**********************************************************************************************************************
setParameters()16 vector<string> UnifracWeightedCommand::setParameters(){
17 	try {
18 		CommandParameter ptree("tree", "InputTypes", "", "", "none", "none", "none","weighted-wsummary",false,true,true); parameters.push_back(ptree);
19         CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none","",false,false,true); parameters.push_back(pname);
20         CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none","",false,false,true); parameters.push_back(pcount);
21 		CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","",false,false,true); parameters.push_back(pgroup);
22 		CommandParameter pgroups("groups", "String", "", "", "", "", "","",false,false); parameters.push_back(pgroups);
23 		CommandParameter piters("iters", "Number", "", "1000", "", "", "","",false,false); parameters.push_back(piters);
24 		CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
25         CommandParameter psubsample("subsample", "String", "", "", "", "", "","",false,false); parameters.push_back(psubsample);
26         CommandParameter pwithreplacement("withreplacement", "Boolean", "", "F", "", "", "","",false,false,true); parameters.push_back(pwithreplacement);
27         CommandParameter pconsensus("consensus", "Boolean", "", "F", "", "", "","tree",false,false); parameters.push_back(pconsensus);
28         CommandParameter prandom("random", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(prandom);
29 		CommandParameter pdistance("distance", "Multiple", "column-lt-square-phylip", "column", "", "", "","phylip-column",false,false); parameters.push_back(pdistance);
30 		CommandParameter proot("root", "Boolean", "F", "", "", "", "","",false,false); parameters.push_back(proot);
31 		CommandParameter pseed("seed", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pseed);
32         CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
33 		CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
34 
35         vector<string> tempOutNames;
36         outputTypes["weighted"] = tempOutNames;
37         outputTypes["wsummary"] = tempOutNames;
38         outputTypes["phylip"] = tempOutNames;
39         outputTypes["column"] = tempOutNames;
40         outputTypes["tree"] = tempOutNames;
41 
42         abort = false; calledHelp = false;
43 
44 		vector<string> myArray;
45 		for (int i = 0; i < parameters.size(); i++) {	myArray.push_back(parameters[i].name);		}
46 		return myArray;
47 	}
48 	catch(exception& e) {
49 		m->errorOut(e, "UnifracWeightedCommand", "setParameters");
50 		exit(1);
51 	}
52 }
53 //**********************************************************************************************************************
getHelpString()54 string UnifracWeightedCommand::getHelpString(){
55 	try {
56 		string helpString = "";
57 		helpString += "The unifrac.weighted command parameters are tree, group, name, count, groups, iters, distance, processors, root, subsample, consensus and random.  tree parameter is required unless you have valid current tree file.\n";
58 		helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed.  You must enter at least 2 valid groups.\n";
59 		helpString += "The group names are separated by dashes.  The iters parameter allows you to specify how many random trees you would like compared to your tree.\n";
60 		helpString += "The distance parameter allows you to create a distance file from the results. The default is false.\n";
61 		helpString += "The random parameter allows you to shut off the comparison to random trees. The default is false, meaning don't compare your trees with randomly generated trees.\n";
62 		helpString += "The root parameter allows you to include the entire root in your calculations. The default is false, meaning stop at the root for this comparision instead of the root of the entire tree.\n";
63 		helpString += "The processors parameter allows you to specify the number of processors to use. The default is 1.\n";
64         helpString += "The subsample parameter allows you to enter the size pergroup of the sample or you can set subsample=T and mothur will use the size of your smallest group. The subsample parameter may only be used with a group file.\n";
65         helpString += "The withreplacement parameter allows you to indicate you want to subsample your data allowing for the same read to be included multiple times. Default=f. \n";
66         helpString += "The consensus parameter allows you to indicate you would like trees built from distance matrices created with the results, as well as a consensus tree built from these trees. Default=F.\n";
67         helpString += "The unifrac.weighted command should be in the following format: unifrac.weighted(groups=yourGroups, iters=yourIters).\n";
68 		helpString += "Example unifrac.weighted(groups=A-B-C, iters=500).\n";
69 		helpString += "The default value for groups is all the groups in your groupfile, and iters is 1000.\n";
70 		helpString += "The unifrac.weighted command output two files: .weighted and .wsummary their descriptions are in the manual.\n";
71 
72 		return helpString;
73 	}
74 	catch(exception& e) {
75 		m->errorOut(e, "UnifracWeightedCommand", "getHelpString");
76 		exit(1);
77 	}
78 }
79 //**********************************************************************************************************************
getOutputPattern(string type)80 string UnifracWeightedCommand::getOutputPattern(string type) {
81     try {
82         string pattern = "";
83         if (type == "weighted")            {  pattern = "[filename],weighted-[filename],[tag],weighted";   }
84         else if (type == "wsummary")        {  pattern = "[filename],[tag],wsummary";   }
85         else if (type == "phylip")           {  pattern = "[filename],[tag],[tag2],dist";   }
86         else if (type == "column")           {  pattern = "[filename],[tag],[tag2],dist";   }
87         else if (type == "tree")             {  pattern = "[filename],[tag],[tag2],tre";   }
88         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->setControl_pressed(true);  }
89 
90         return pattern;
91     }
92     catch(exception& e) {
93         m->errorOut(e, "UnifracWeightedCommand", "getOutputPattern");
94         exit(1);
95     }
96 }
97 /***********************************************************/
UnifracWeightedCommand(string option)98 UnifracWeightedCommand::UnifracWeightedCommand(string option) : Command() {
99 	try {
100 
101 		if(option == "help") { help(); abort = true; calledHelp = true; }
102 		else if(option == "citation") { citation(); abort = true; calledHelp = true;}
103         else if(option == "category") {  abort = true; calledHelp = true;  }
104 
105 		else {
106 			OptionParser parser(option, setParameters());
107 			map<string,string> parameters=parser.getParameters();
108 
109 			ValidParameters validParameter;
110 			treefile = validParameter.validFile(parameters, "tree");
111 			if (treefile == "not open") { treefile = ""; abort = true; }
112 			else if (treefile == "not found") { 				//if there is a current design file, use it
113 				treefile = current->getTreeFile();
114 				if (treefile != "") { m->mothurOut("Using " + treefile + " as input file for the tree parameter.\n");  }
115 				else { 	m->mothurOut("You have no current tree file and the tree parameter is required.\n");  abort = true; }
116 			}else { current->setTreeFile(treefile); }
117 
118 			//check for required parameters
119 			groupfile = validParameter.validFile(parameters, "group");
120 			if (groupfile == "not open") { abort = true; }
121 			else if (groupfile == "not found") { groupfile = ""; }
122 			else { current->setGroupFile(groupfile); }
123 
124 			namefile = validParameter.validFile(parameters, "name");
125 			if (namefile == "not open") { namefile = ""; abort = true; }
126 			else if (namefile == "not found") { namefile = ""; }
127 			else { current->setNameFile(namefile); }
128 
129             countfile = validParameter.validFile(parameters, "count");
130 			if (countfile == "not open") { countfile = ""; abort = true; }
131 			else if (countfile == "not found") { countfile = "";  }
132 			else { current->setCountFile(countfile); }
133 
134             if ((namefile != "") && (countfile != "")) {
135                 m->mothurOut("[ERROR]: you may only use one of the following: name or count.\n");  abort = true;
136             }
137 
138             if ((groupfile != "") && (countfile != "")) {
139                 m->mothurOut("[ERROR]: you may only use one of the following: group or count.\n");  abort=true;
140             }
141 
142 					if (outputdir == ""){    outputdir = util.hasPath(treefile);	}
143 
144 
145 			//check for optional parameter and set defaults
146 			// ...at some point should added some additional type checking...
147 			groups = validParameter.valid(parameters, "groups");
148 			if (groups == "not found") { groups = ""; }
149 			else {
150 				util.splitAtDash(groups, Groups);
151                 if (Groups.size() != 0) { if (Groups[0]== "all") { Groups.clear(); } }
152 			}
153 
154 			itersString = validParameter.valid(parameters, "iters");			if (itersString == "not found") { itersString = "1000"; }
155 			util.mothurConvert(itersString, iters);
156 
157 			string temp = validParameter.valid(parameters, "distance");
158 			if (temp == "not found") { phylip = false; outputForm = ""; }
159 			else{
160                 if (temp=="phylip") { temp = "lt"; }
161 				if ((temp == "lt") || (temp == "column") || (temp == "square")) {  phylip = true;  outputForm = temp; }
162 				else { m->mothurOut("Options for distance are: lt, square, or column. Using lt.\n");  phylip = true; outputForm = "lt"; }
163 			}
164 
165 			temp = validParameter.valid(parameters, "random");				if (temp == "not found") { temp = "F"; }
166 			random = util.isTrue(temp);
167 
168 			temp = validParameter.valid(parameters, "root");					if (temp == "not found") { temp = "F"; }
169 			includeRoot = util.isTrue(temp);
170 
171 			temp = validParameter.valid(parameters, "processors");	if (temp == "not found"){	temp = current->getProcessors();	}
172             processors = current->setProcessors(temp);
173 
174             temp = validParameter.valid(parameters, "subsample");		if (temp == "not found") { temp = "F"; }
175 			if (util.isNumeric1(temp)) { util.mothurConvert(temp, subsampleSize); subsample = true; }
176             else {
177                 if (util.isTrue(temp)) { subsample = true; subsampleSize = -1; }  //we will set it to smallest group later
178                 else { subsample = false; }
179             }
180 
181             if (!subsample) { subsampleIters = 0;   }
182             else { subsampleIters = iters;          }
183 
184             temp = validParameter.valid(parameters, "withreplacement");		if (temp == "not found"){	temp = "f";		}
185             withReplacement = util.isTrue(temp);
186 
187             temp = validParameter.valid(parameters, "consensus");					if (temp == "not found") { temp = "F"; }
188 			consensus = util.isTrue(temp);
189 
190 			if (subsample && random) {  m->mothurOut("[ERROR]: random must be false, if subsample=t.\n"); abort=true;  }
191 			if (countfile == "") { if (subsample && (groupfile == "")) {  m->mothurOut("[ERROR]: if subsample=t, a group file must be provided.\n"); abort=true;  } }
192             else {
193                 CountTable testCt;
194                 if ((!testCt.testGroups(countfile)) && (subsample)) {
195                     m->mothurOut("[ERROR]: if subsample=t, a count file with group info must be provided.\n"); abort=true;
196                 }
197             }
198             if (subsample && (!phylip)) { phylip=true; outputForm = "lt"; }
199             if (consensus && (!subsample)) { m->mothurOut("[ERROR]: you cannot use consensus without subsample.\n"); abort=true; }
200 
201 			if (countfile=="") {
202                 if (namefile == "") {
203                     vector<string> files; files.push_back(treefile);
204                     if (!current->getMothurCalling())  {  parser.getNameFile(files);  }
205                 }
206             }
207 		}
208 	}
209 	catch(exception& e) {
210 		m->errorOut(e, "UnifracWeightedCommand", "UnifracWeightedCommand");
211 		exit(1);
212 	}
213 }
214 /***********************************************************/
execute()215 int UnifracWeightedCommand::execute() {
216 	try {
217 		if (abort) { if (calledHelp) { return 0; }  return 2;	}
218 
219         long start = time(NULL);
220 
221         TreeReader* reader;
222         if (countfile == "") { reader = new TreeReader(treefile, groupfile, namefile); }
223         else { reader = new TreeReader(treefile, countfile); }
224         vector<Tree*> T = reader->getTrees();
225         CountTable* ct; ct = T[0]->getCountTable();
226         if ((Groups.size() == 0) || (Groups.size() < 2)) {  Groups = ct->getNamesOfGroups();  } //must have at least 2 groups to compare
227         delete reader;
228 
229         if (m->getControl_pressed()) {  delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; } return 0; }
230 
231         map<string, string> variables;
232 		vector<string> nameGroups = ct->getNamesOfGroups();
233         if (Groups.size() < 2) { m->mothurOut("[ERROR]: You cannot run unifrac.weighted with less than 2 groups, aborting.\n"); delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; } return 0; }
234         if (m->getControl_pressed()) {  delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; } return 0; }
235 
236         if ((Groups.size() == 0) || (Groups.size() < 2)) {  Groups = ct->getNamesOfGroups();  } //must have at least 2 groups to compare
237 
238         //set or check size
239         if (subsample) {
240             //user has not set size, set size = smallest samples size
241             if (subsampleSize == -1) {
242                 subsampleSize = ct->getNumSeqsSmallestGroup();
243                 m->mothurOut("\nSetting subsample size to " + toString(subsampleSize) + ".\n\n");
244             }else { //eliminate any too small groups
245                 vector<string> newGroups = Groups;
246                 Groups.clear();
247                 for (int i = 0; i < newGroups.size(); i++) {
248                     int thisSize = ct->getGroupCount(newGroups[i]);
249 
250                     if (thisSize >= subsampleSize) {    Groups.push_back(newGroups[i]);	}
251                     else {   m->mothurOut("You have selected a size that is larger than "+newGroups[i]+" number of sequences, removing "+newGroups[i]+".\n"); }
252                 }
253             }
254         }
255 
256         vector<string> groupComb; util.getCombos(groupComb, Groups, numComp); //here in case some groups are removed by subsample
257 
258         if (numComp < processors) { processors = numComp; m->mothurOut("Reducing processors to " + toString(numComp) + ".\n"); }
259         if (consensus && (numComp < 2)) { m->mothurOut("consensus can only be used with numComparisions greater than 1, setting consensus=f.\n"); consensus=false; }
260 
261         Weighted weighted(includeRoot, Groups);
262 
263         for (int i = 0; i < T.size(); i++) {
264 
265             if (m->getControl_pressed()) { break; }
266 
267             vector<double> WScoreSig;  //tree weighted score signifigance when compared to random trees - percentage of random trees with that score or lower.
268             vector< vector<double> > uScores;  uScores.resize(numComp);  //data[0] = weightedscore AB, data[1] = weightedscore AC...
269             vector<double> userData; userData.resize(numComp,0);  //weighted score info for user tree. data[0] = weightedscore AB, data[1] = weightedscore AC...
270             vector<double> randomData; randomData.resize(numComp,0); //weighted score info for random trees. data[0] = weightedscore AB, data[1] = weightedscore AC...
271 
272             userData = weighted.getValues(T[i], processors, outputdir); //userData[0] = weightedscore
273             if (m->getControl_pressed()) { break; }
274 
275             if (phylip) {	createPhylipFile((i+1), userData);          }
276             if (random) { runRandomCalcs(T[i], ct, userData, (i+1), WScoreSig, groupComb);    }
277             printWSummaryFile((i+1), userData, WScoreSig, groupComb);
278 
279             if (m->getControl_pressed()) { break; }
280 
281             //subsample loop
282             vector< vector<double> > calcDistsTotals;  //each iter, each groupCombos dists. this will be used to make .dist files
283             SubSample sample;
284             for (int thisIter = 0; thisIter < subsampleIters; thisIter++) { //subsampleIters=0, if subsample=f.
285                 if (m->getControl_pressed()) { break; }
286 
287                 //uses method of setting groups to doNotIncludeMe
288                 //copy to preserve old one - would do this in subsample but memory cleanup becomes messy.
289                 CountTable* newCt = new CountTable();
290                 int sampleTime = time(NULL);
291 
292                 Tree* subSampleTree;
293                 if (withReplacement)    { subSampleTree = sample.getSampleWithReplacement(T[i], ct, newCt, subsampleSize, Groups);  }
294                 else                    { subSampleTree = sample.getSample(T[i], ct, newCt, subsampleSize, Groups);                 }
295 
296                 if (m->getDebug()) { m->mothurOut("[DEBUG]: iter " + toString(thisIter) + " took " + toString(time(NULL) - sampleTime) + " seconds to sample tree.\n"); }
297 
298                 //call new weighted function
299                 vector<double> iterData; iterData.resize(numComp,0);
300                 Weighted thisWeighted(includeRoot, Groups);
301                 iterData = thisWeighted.getValues(subSampleTree, processors, outputdir); //userData[0] = weightedscore
302 
303                 //save data to make ave dist, std dist
304                 calcDistsTotals.push_back(iterData);
305 
306                 delete newCt; delete subSampleTree;
307 
308                 if((thisIter+1) % 100 == 0){	m->mothurOutJustToScreen(toString(thisIter+1)+"\n"); 	}
309             }
310 
311             if (m->getControl_pressed()) { break; }
312 
313             if (subsample) {  getAverageSTDMatrices(calcDistsTotals, i);    }
314             if (consensus) {  getConsensusTrees(calcDistsTotals, i);        }
315         }
316 		delete ct;  for (int i = 0; i < T.size(); i++) { delete T[i]; }
317 
318 		if (m->getControl_pressed()) { for (int i = 0; i < outputNames.size(); i++) {	util.mothurRemove(outputNames[i]);  } return 0; }
319 
320 		m->mothurOut("It took " + toString(time(NULL) - start) + " secs to run unifrac.weighted.\n");
321 
322 		//set phylip file as new current phylipfile
323 		string currentName = "";
324 		itTypes = outputTypes.find("phylip");
325 		if (itTypes != outputTypes.end()) {
326 			if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setPhylipFile(currentName); }
327 		}
328 
329 		//set column file as new current columnfile
330 		itTypes = outputTypes.find("column");
331 		if (itTypes != outputTypes.end()) {
332 			if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setColumnFile(currentName); }
333 		}
334 
335 		m->mothurOut("\nOutput File Names: \n");
336 		for (int i = 0; i < outputNames.size(); i++) {	m->mothurOut(outputNames[i] +"\n"); 	} m->mothurOutEndLine();
337 
338 		return 0;
339 	}
340 	catch(exception& e) {
341 		m->errorOut(e, "UnifracWeightedCommand", "execute");
342 		exit(1);
343 	}
344 }
345 /**************************************************************************************************/
getAverageSTDMatrices(vector<vector<double>> & dists,int treeNum)346 int UnifracWeightedCommand::getAverageSTDMatrices(vector< vector<double> >& dists, int treeNum) {
347 	try {
348         vector<double> averages = util.getAverages(dists);
349         vector<double> stdDev = util.getStandardDeviation(dists, averages);
350 
351         int numGroups = Groups.size();
352         vector< vector<double> > avedists;
353         for (int i = 0; i < numGroups; i++) {
354             vector<double> temp; temp.resize(numGroups, 0.0);
355             avedists.push_back(temp);
356         }
357 
358         //make matrix with scores in it
359         vector< vector<double> > stddists;	//stddists.resize(m->getNumGroups());
360         for (int i = 0; i < numGroups; i++) {
361             vector<double> temp; for (int j = 0; j < numGroups; j++) { temp.push_back(0.0); }
362             stddists.push_back(temp);
363         }
364 
365         //flip it so you can print it
366         int count = 0;
367         for (int r=0; r< numGroups; r++) {
368             for (int l = 0; l < r; l++) {
369                 avedists[r][l] = averages[count];
370                 avedists[l][r] = averages[count];
371                 stddists[r][l] = stdDev[count];
372                 stddists[l][r] = stdDev[count];
373                 count++;
374             }
375         }
376 
377         map<string, string> variables;
378 		variables["[filename]"] = outputdir + util.getSimpleName(treefile);
379         variables["[tag]"] = toString(treeNum+1);
380         variables["[tag2]"] = "weighted.ave";
381         string aveFileName = getOutputFileName("phylip",variables);
382         if (outputForm != "column") { outputNames.push_back(aveFileName); outputTypes["phylip"].push_back(aveFileName);  }
383         else { outputNames.push_back(aveFileName); outputTypes["column"].push_back(aveFileName);  }
384         ofstream out; util.openOutputFile(aveFileName, out);
385 
386         variables["[tag2]"] = "weighted.std";
387         string stdFileName = getOutputFileName("phylip",variables);
388         if (outputForm != "column") { outputNames.push_back(stdFileName); outputTypes["phylip"].push_back(stdFileName); }
389         else { outputNames.push_back(stdFileName); outputTypes["column"].push_back(stdFileName); }
390         ofstream outStd; util.openOutputFile(stdFileName, outStd);
391 
392         if ((outputForm == "lt") || (outputForm == "square")) {
393             //output numSeqs
394             out << numGroups << endl;
395             outStd << numGroups << endl;
396         }
397 
398         //output to file
399         for (int r=0; r< numGroups; r++) {
400             string name = Groups[r];
401             if (name.length() < 10) { while (name.length() < 10) {  name += " ";  } } //pad with spaces to make compatible
402 
403             if (outputForm == "lt") {
404                 out << name; outStd << name;
405                 for (int l = 0; l < r; l++) {	out  << '\t' << avedists[r][l];  outStd   << '\t' << stddists[r][l];}  //output distances
406                 out << endl;  outStd << endl;
407             }else if (outputForm == "square") {
408                 out << name; outStd << name;
409                 for (int l = 0; l < numGroups; l++) {	out  << '\t' << avedists[r][l]; outStd  << '\t' << stddists[r][l]; }  //output distances
410                 out << endl; outStd << endl;
411             }else{
412                 for (int l = 0; l < r; l++) {
413                     string otherName = Groups[l];
414                     if (otherName.length() < 10) { while (otherName.length() < 10) {  otherName += " ";  } }  //pad with spaces to make compatible
415 
416                     out  << name << '\t' << otherName  << '\t' << avedists[r][l] << endl;  //output distances
417                     outStd  << name << '\t' << otherName  << '\t' << stddists[r][l] << endl;
418                 }
419             }
420         }
421         out.close(); outStd.close();
422 
423         return 0;
424     }
425 	catch(exception& e) {
426 		m->errorOut(e, "UnifracWeightedCommand", "getAverageSTDMatrices");
427 		exit(1);
428 	}
429 }
430 /**************************************************************************************************/
getConsensusTrees(vector<vector<double>> & dists,int treeNum)431 int UnifracWeightedCommand::getConsensusTrees(vector< vector<double> >& dists, int treeNum) {
432 	try {
433         ///create treemap class from groupmap for tree class to use
434         CountTable newCt;
435         set<string> nameMap;
436         map<string, string> groupMap;
437         set<string> gps;
438         int numGroups = Groups.size();
439         for (int i = 0; i < numGroups; i++) {
440             nameMap.insert(Groups[i]);
441             gps.insert(Groups[i]);
442             groupMap[Groups[i]] = Groups[i];
443         }
444         newCt.createTable(nameMap, groupMap, gps);
445 
446         vector<Tree*> newTrees = buildTrees(dists, treeNum, newCt); //also creates .all.tre file containing the trees created
447 
448         if (m->getControl_pressed()) { return 0; }
449 
450         Consensus con;
451         Tree* conTree = con.getTree(newTrees);
452 
453         //create a new filename
454         map<string, string> variables;
455 		variables["[filename]"] = outputdir + util.getRootName(util.getSimpleName(treefile));
456         variables["[tag]"] = toString(treeNum+1);
457         variables["[tag2]"] = "weighted.cons";
458         string conFile = getOutputFileName("tree",variables);
459         outputNames.push_back(conFile); outputTypes["tree"].push_back(conFile);
460         ofstream outTree;
461         util.openOutputFile(conFile, outTree);
462 
463         if (conTree != NULL) { conTree->print(outTree, "boot"); delete conTree; } outTree.close();
464 
465         return 0;
466     }
467 	catch(exception& e) {
468 		m->errorOut(e, "UnifracWeightedCommand", "getConsensusTrees");
469 		exit(1);
470 	}
471 }
472 /**************************************************************************************************/
buildTrees(vector<vector<double>> & dists,int treeNum,CountTable & myct)473 vector<Tree*> UnifracWeightedCommand::buildTrees(vector< vector<double> >& dists, int treeNum, CountTable& myct) {
474 	try {
475         vector<Tree*> trees;
476 
477         //create a new filename
478         map<string, string> variables;
479 		variables["[filename]"] = outputdir + util.getRootName(util.getSimpleName(treefile));
480         variables["[tag]"] = toString(treeNum+1);
481         variables["[tag2]"] = "weighted.all";
482         string outputFile = getOutputFileName("tree",variables);
483         outputNames.push_back(outputFile); outputTypes["tree"].push_back(outputFile);
484 
485         ofstream outAll;
486         util.openOutputFile(outputFile, outAll);
487         int numGroups = Groups.size();
488 
489         for (int i = 0; i < dists.size(); i++) { //dists[0] are the dists for the first subsampled tree.
490 
491             if (m->getControl_pressed()) { break; }
492 
493             //make matrix with scores in it
494             vector< vector<double> > sims;	sims.resize(numGroups);
495             for (int j = 0; j < numGroups; j++) { sims[j].resize(numGroups, 0.0); }
496 
497             int count = 0;
498 			for (int r=0; r<numGroups; r++) {
499 				for (int l = 0; l < r; l++) {
500                     double sim = -(dists[i][count]-1.0);
501 					sims[r][l] = sim;
502 					sims[l][r] = sim;
503 					count++;
504 				}
505 			}
506 
507             //create tree
508             Tree* tempTree = new Tree(&myct, sims, Groups);
509             tempTree->assembleTree();
510             trees.push_back(tempTree);
511             tempTree->print(outAll); //print tree
512         }
513         outAll.close();
514 
515         if (m->getControl_pressed()) {  for (int i = 0; i < trees.size(); i++) {  delete trees[i]; trees[i] = NULL; } util.mothurRemove(outputFile); }
516 
517         return trees;
518     }
519 	catch(exception& e) {
520 		m->errorOut(e, "UnifracWeightedCommand", "buildTrees");
521 		exit(1);
522 	}
523 }
524 /**************************************************************************************************/
runRandomCalcs(Tree * thisTree,CountTable * ct,vector<double> usersScores,int iter,vector<double> & WScoreSig,vector<string> groupComb)525 int UnifracWeightedCommand::runRandomCalcs(Tree* thisTree, CountTable* ct, vector<double> usersScores, int iter, vector<double>& WScoreSig, vector<string> groupComb) {
526 	try {
527         map<string, string> variables;
528         variables["[filename]"] = outputdir + util.getSimpleName(treefile);
529         variables["[tag]"] = toString(iter);
530         string wFileName = getOutputFileName("weighted", variables);
531         ColumnFile output(wFileName, itersString); ofstream out; util.openOutputFile(wFileName, out); out.close();
532         outputNames.push_back(wFileName); outputTypes["weighted"].push_back(wFileName);
533 
534         //calculate number of comparisons i.e. with groups A,B,C = AB, AC, BC = 3;
535         vector< vector<string> > namesOfGroupCombos;
536         int numGroups = Groups.size();
537         for (int a=0; a<numGroups; a++) {
538             for (int l = 0; l < a; l++) {
539                 vector<string> groups; groups.push_back(Groups[a]); groups.push_back(Groups[l]);
540                 namesOfGroupCombos.push_back(groups);
541             }
542         }
543 
544         vector<vector<int> > randomTreeNodes;
545         for (int f = 0; f < numComp; f++) { randomTreeNodes.push_back(thisTree->getNodes(namesOfGroupCombos[f])); }
546         vector<vector<int> > savedRandomTreeNodes = randomTreeNodes;
547 
548         //get scores for random trees
549         vector<vector<double> > rScores; rScores.resize(numComp);
550         for (int i = 0; i < iters; i++) {
551             if (m->getControl_pressed()) { return 0; }
552             randomTreeNodes = savedRandomTreeNodes;
553 
554             for (int f = 0; f < numComp; f++) {   util.mothurRandomShuffle(randomTreeNodes[f]);   }
555 
556             vector<double> thisItersRScores = createProcesses(thisTree, ct, namesOfGroupCombos, randomTreeNodes);
557 
558             for (int f = 0; f < numComp; f++) {   rScores[f].push_back(thisItersRScores[f]);  }
559 
560             if((i+1) % 100 == 0){	m->mothurOut(toString(i+1)+"\n");		}
561         }
562 
563         //find the signifigance of the score for summary file
564         for (int f = 0; f < numComp; f++) {
565             //sort random scores
566             sort(rScores[f].begin(), rScores[f].end());
567 
568             //the index of the score higher than yours is returned
569             //so if you have 1000 random trees the index returned is 100
570             //then there are 900 trees with a score greater then you.
571             //giving you a signifigance of 0.900
572             int index = findIndex(usersScores[f], f, rScores);    if (index == -1) { m->mothurOut("error in UnifracWeightedCommand\n");  exit(1); } //error code
573 
574             //the signifigance is the number of trees with the users score or higher
575             WScoreSig.push_back((iters-index)/(float)iters);
576         }
577 
578         set<double>  validScores;  //map contains scores from random
579         vector< map<double, double> > rScoreFreq;  //map <weighted score, number of random trees with that score.> -vector entry for each combination.
580         vector< map<double, double> > rCumul;  //map <weighted score, cumulative percentage of number of random trees with that score or higher.> -vector entry for each c
581         calculateFreqsCumuls(validScores, rScores, rScoreFreq, rCumul);
582 
583         vector<string> tags; tags.push_back("Score"); tags.push_back("RandFreq"); tags.push_back("RandCumul");
584         for(int a = 0; a < numComp; a++) {
585             output.setLabelName(groupComb[a], tags);
586             //print each line
587             for (set<double>::iterator it = validScores.begin(); it != validScores.end(); it++) {
588                 vector<double> data; data.push_back(*it);  data.push_back(rScoreFreq[a][*it]); data.push_back(rCumul[a][*it]);
589                 output.updateOutput(data);
590             }
591             output.resetFile();
592         }
593         return 0;
594     }
595 	catch(exception& e) {
596 		m->errorOut(e, "UnifracWeightedCommand", "runRandomCalcs");
597 		exit(1);
598 	}
599 }
600 /***********************************************************************/
601 struct weightedRandomData {
602     bool includeRoot;
603     int count, numComps, start, num;
604     vector<string> Groups, Treenames;
605     vector<double> scores;
606     vector< vector<string> > namesOfGroupCombos;
607     vector<vector<int> > randomizedTreeNodes;
608     MothurOut* m;
609     Tree* t;
610     CountTable* ct;
611     Utils util;
612 
weightedRandomDataweightedRandomData613     weightedRandomData(){}
weightedRandomDataweightedRandomData614     weightedRandomData(int st, int en, vector< vector<string> > ngc, Tree* tree, CountTable* count, bool ir, vector<string> g, vector<vector<int> > randomTreeNodes) {
615         m = MothurOut::getInstance();
616         num = en;
617         start = st;
618         namesOfGroupCombos = ngc;
619         numComps = namesOfGroupCombos.size();
620         randomizedTreeNodes = randomTreeNodes;
621         t = tree;
622         ct = count;
623         includeRoot = ir;
624         Groups = g;
625         Treenames = t->getTreeNames();
626         count = 0;
627     }
628 };
629 /**************************************************************************************************/
driverWeightedRandom(weightedRandomData * params)630 void driverWeightedRandom(weightedRandomData* params) {
631     try {
632         Weighted weighted(params->includeRoot, params->Groups);
633 
634         params->count = 0;
635 
636         Tree* randT = new Tree(params->ct, params->Treenames);
637 
638         for (int h = params->start; h < (params->start+params->num); h++) {
639 
640             if (params->m->getControl_pressed()) { break; }
641 
642             string groupA = params->namesOfGroupCombos[h][0];
643             string groupB = params->namesOfGroupCombos[h][1];
644             vector<int> treeNodesFromTheseGroups = params->randomizedTreeNodes[h];
645 
646             //copy T[i]'s info.
647             randT->getCopy(params->t);
648 
649             //create a random tree with same topology as T[i], but different labels
650             randT->assembleRandomUnifracTree(params->randomizedTreeNodes[h]);
651 
652             if (params->m->getControl_pressed()) { break; }
653 
654             //get wscore of random tree
655             EstOutput randomData = weighted.getValues(randT, groupA, groupB);
656 
657             if (params->m->getControl_pressed()) { break; }
658 
659             //save scores
660             params->scores.push_back(randomData[0]);
661         }
662 
663         delete randT;
664     }
665     catch(exception& e) {
666         params->m->errorOut(e, "UnifracWeightedCommand", "driver");
667         exit(1);
668     }
669 }
670 /**************************************************************************************************/
createProcesses(Tree * t,CountTable * ct,vector<vector<string>> namesOfGroupCombos,vector<vector<int>> & randomizedTreeNodes)671 vector<double> UnifracWeightedCommand::createProcesses(Tree* t, CountTable* ct, vector< vector<string> > namesOfGroupCombos, vector<vector<int> >& randomizedTreeNodes) {
672 	try {
673         //breakdown work between processors
674         vector<linePair> lines;
675         int remainingPairs = namesOfGroupCombos.size();
676         if (remainingPairs < processors) { processors = remainingPairs; }
677         int startIndex = 0;
678         for (int remainingProcessors = processors; remainingProcessors > 0; remainingProcessors--) {
679             int numPairs = remainingPairs; //case for last processor
680             if (remainingProcessors != 1) { numPairs = ceil(remainingPairs / remainingProcessors); }
681             lines.push_back(linePair(startIndex, numPairs)); //startIndex, numPairs
682             startIndex = startIndex + numPairs;
683             remainingPairs = remainingPairs - numPairs;
684         }
685         //create array of worker threads
686         vector<std::thread*> workerThreads;
687         vector<weightedRandomData*> data;
688 
689         //Lauch worker threads
690         for (int i = 0; i < processors-1; i++) {
691             CountTable* copyCount = new CountTable();
692             copyCount->copy(ct);
693             vector<string> Treenames = t->getTreeNames();
694             Tree* copyTree = new Tree(copyCount, Treenames);
695             copyTree->getCopy(t);
696 
697             weightedRandomData* dataBundle = new weightedRandomData(lines[i+1].start, lines[i+1].end, namesOfGroupCombos, copyTree, copyCount, includeRoot, Groups, randomizedTreeNodes);
698             data.push_back(dataBundle);
699             workerThreads.push_back(new std::thread(driverWeightedRandom, dataBundle));
700         }
701 
702         weightedRandomData* dataBundle = new weightedRandomData(lines[0].start, lines[0].end, namesOfGroupCombos, t, ct, includeRoot, Groups, randomizedTreeNodes);
703         driverWeightedRandom(dataBundle);
704         vector<double> scores = dataBundle->scores;
705 
706 
707         for (int i = 0; i < processors-1; i++) {
708             workerThreads[i]->join();
709 
710             scores.insert(scores.end(), data[i]->scores.begin(), data[i]->scores.end());
711 
712             delete data[i]->t; delete data[i]->ct; delete data[i]; delete workerThreads[i];
713         }
714         delete dataBundle;
715         return scores;
716 	}
717 	catch(exception& e) {
718 		m->errorOut(e, "UnifracWeightedCommand", "createProcesses");
719 		exit(1);
720 	}
721 }
722 /***********************************************************/
printWSummaryFile(int treeIndex,vector<double> utreeScores,vector<double> WScoreSig,vector<string> groupComb)723 void UnifracWeightedCommand::printWSummaryFile(int treeIndex, vector<double> utreeScores, vector<double> WScoreSig, vector<string> groupComb) {
724 	try {
725         map<string, string> variables;
726         variables["[filename]"] = outputdir + util.getSimpleName(treefile);
727         variables["[tag]"] = toString(treeIndex);
728         sumFile = getOutputFileName("wsummary",variables);
729         outputNames.push_back(sumFile);  outputTypes["wsummary"].push_back(sumFile);
730         ofstream outSum; util.openOutputFile(sumFile, outSum);
731 
732 		//column headers
733 		outSum << "Tree#" << '\t' << "Groups" << '\t' << "WScore" << '\t';
734 		m->mothurOut("Tree#\tGroups\tWScore\t");
735 		if (random) { outSum << "WSig"; m->mothurOut("WSig"); }
736 		outSum << endl; m->mothurOutEndLine();
737 
738 		//format output
739 		outSum.setf(ios::fixed, ios::floatfield); outSum.setf(ios::showpoint);
740 
741 		//print each line
742         int precisionLength = itersString.length();
743         for (int j = 0; j < numComp; j++) {
744             if (random) {
745                 if (WScoreSig[j] > (1/(float)iters)) {
746                     outSum << setprecision(6) << treeIndex << '\t' << groupComb[j] << '\t' << utreeScores[j] << '\t' << setprecision(precisionLength) << WScoreSig[j] << endl;
747                     cout << setprecision(6) << treeIndex << '\t' << groupComb[j] << '\t' << utreeScores[j] << '\t' << setprecision(precisionLength) << WScoreSig[j] << endl;
748                     m->mothurOutJustToLog(toString(treeIndex) +"\t" + groupComb[j] +"\t" + toString(utreeScores[j]) +"\t" +  toString(WScoreSig[j]) + "\n");
749                 }else{
750                     outSum << setprecision(6) << treeIndex << '\t' << groupComb[j] << '\t' << utreeScores[j] << '\t' << setprecision(precisionLength) << "<" << (1/float(iters)) << endl;
751                     cout << setprecision(6) << treeIndex << '\t' << groupComb[j] << '\t' << utreeScores[j] << '\t' << setprecision(precisionLength) << "<" << (1/float(iters)) << endl;
752                     m->mothurOutJustToLog(toString(treeIndex) +"\t" + groupComb[j] +"\t" + toString(utreeScores[j]) +"\t<" +  toString((1/float(iters))) + "\n");
753                 }
754             }else{
755                 outSum << setprecision(6) << treeIndex << '\t' << groupComb[j] << '\t' << utreeScores[j] << endl;
756                 cout << setprecision(6) << treeIndex << '\t' << groupComb[j] << '\t' << utreeScores[j]  << endl;
757                 m->mothurOutJustToLog(toString(treeIndex) +"\t" + groupComb[j] +"\t" + toString(utreeScores[j]) +"\n");
758             }
759         }
760 		outSum.close();
761 	}
762 	catch(exception& e) {
763 		m->errorOut(e, "UnifracWeightedCommand", "printWSummaryFile");
764 		exit(1);
765 	}
766 }
767 /***********************************************************/
createPhylipFile(int treeIndex,vector<double> utreeScores)768 void UnifracWeightedCommand::createPhylipFile(int treeIndex, vector<double> utreeScores) {
769 	try {
770 		int count = 0;
771         int numGroups = Groups.size();
772 
773         string phylipFileName;
774         map<string, string> variables;
775         variables["[filename]"] = outputdir + util.getSimpleName(treefile);
776         variables["[tag]"] = toString(treeIndex);
777         if ((outputForm == "lt") || (outputForm == "square")) {
778             variables["[tag2]"] = "weighted.phylip";
779             phylipFileName = getOutputFileName("phylip",variables);
780             outputNames.push_back(phylipFileName); outputTypes["phylip"].push_back(phylipFileName);
781         }else { //column
782             variables["[tag2]"] = "weighted.column";
783             phylipFileName = getOutputFileName("column",variables);
784             outputNames.push_back(phylipFileName); outputTypes["column"].push_back(phylipFileName);
785         }
786 
787         ofstream out; util.openOutputFile(phylipFileName, out);
788 
789         if ((outputForm == "lt") || (outputForm == "square")) { out << numGroups << endl; }
790 
791         //make matrix with scores in it
792         vector< vector<float> > dists;	dists.resize(numGroups);
793         for (int i = 0; i < numGroups; i++) { dists[i].resize(numGroups, 0.0); }
794 
795         //flip it so you can print it
796         for (int r=0; r< numGroups; r++) { for (int l = 0; l < r; l++) { dists[r][l] = utreeScores[count]; dists[l][r] = utreeScores[count]; count++; } }
797 
798         //output to file
799         for (int r=0; r<numGroups; r++) {
800             string name = Groups[r];
801             if (name.length() < 10) { while (name.length() < 10) {  name += " ";  } } //pad with spaces to make compatible
802 
803             if (outputForm == "lt")          { out << name; for (int l = 0; l < r; l++)         {	out << '\t' << dists[r][l];     } out << endl;  }
804             else if (outputForm == "square") { out << name; for (int l = 0; l < numGroups; l++) {	out << '\t' << dists[r][l];     } out << endl;  }
805             else{
806                 for (int l = 0; l < r; l++) {
807                     string otherName = Groups[l];
808                     if (otherName.length() < 10) { while (otherName.length() < 10) {  otherName += " ";  } }  //pad with spaces to make compatible
809 
810                     out  << name << '\t' << otherName  << '\t' << dists[r][l] << endl;
811                 }
812             }
813         }
814         out.close();
815 	}
816 	catch(exception& e) {
817 		m->errorOut(e, "UnifracWeightedCommand", "createPhylipFile");
818 		exit(1);
819 	}
820 }
821 /***********************************************************/
findIndex(float score,int index,vector<vector<double>> & rScores)822 int UnifracWeightedCommand::findIndex(float score, int index, vector< vector<double> >& rScores) {
823 	try{
824         int results = rScores[index].size();
825 
826 		for (int i = 0; i < rScores[index].size(); i++) { if (rScores[index][i] >= score)	{	results = i; 	break; } }
827 
828 		return results;
829 	}
830 	catch(exception& e) {
831 		m->errorOut(e, "UnifracWeightedCommand", "findIndex");
832 		exit(1);
833 	}
834 }
835 /***********************************************************/
calculateFreqsCumuls(set<double> & validScores,vector<vector<double>> rScores,vector<map<double,double>> & rScoreFreq,vector<map<double,double>> & rCumul)836 void UnifracWeightedCommand::calculateFreqsCumuls(set<double>& validScores, vector< vector<double> > rScores, vector< map<double, double> >& rScoreFreq, vector< map<double, double> >& rCumul) {
837 	try {
838 		//clear out old tree values
839 		rScoreFreq.clear(); rScoreFreq.resize(numComp); rCumul.clear(); rCumul.resize(numComp); validScores.clear();
840 
841 		//calculate frequency
842 		for (int f = 0; f < numComp; f++) {
843 			for (int i = 0; i < rScores[f].size(); i++) { //looks like 0,0,1,1,1,2,4,7...  you want to make a map that say rScoreFreq[0] = 2, rScoreFreq[1] = 3...
844 				validScores.insert(rScores[f][i]);
845 				map<double,double>::iterator it = rScoreFreq[f].find(rScores[f][i]);
846 
847 				if (it != rScoreFreq[f].end())  { rScoreFreq[f][rScores[f][i]]++;   }
848 				else                            { rScoreFreq[f][rScores[f][i]] = 1; }
849 			}
850 		}
851 
852 		//calculate rcumul
853 		for(int a = 0; a < numComp; a++) {
854 			float rcumul = 1.0000;
855 			//this loop fills the cumulative maps and put 0.0000 in the score freq map to make it easier to print.
856 			for (set<double>::iterator it = validScores.begin(); it != validScores.end(); it++) {
857 				//make rscoreFreq map and rCumul
858 				map<double,double>::iterator it2 = rScoreFreq[a].find(*it);
859 				rCumul[a][*it] = rcumul;
860 				//get percentage of random trees with that info
861 				if (it2 != rScoreFreq[a].end()) {  rScoreFreq[a][*it] /= iters; rcumul-= it2->second;  }
862 				else { rScoreFreq[a][*it] = 0.0000; } //no random trees with that score
863 			}
864 		}
865 	}
866 	catch(exception& e) {
867 		m->errorOut(e, "UnifracWeightedCommand", "calculateFreqsCumuls");
868 		exit(1);
869 	}
870 }
871 /***********************************************************/
872