1 /*
2 * unifracweightedcommand.cpp
3 * Mothur
4 *
5 * Created by Sarah Westcott on 2/9/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7 *
8 */
9
10 #include "unifracweightedcommand.h"
11 #include "consensus.h"
12 #include "subsample.h"
13 #include "treereader.h"
14
15 //**********************************************************************************************************************
setParameters()16 vector<string> UnifracWeightedCommand::setParameters(){
17 try {
18 CommandParameter ptree("tree", "InputTypes", "", "", "none", "none", "none","weighted-wsummary",false,true,true); parameters.push_back(ptree);
19 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none","",false,false,true); parameters.push_back(pname);
20 CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none","",false,false,true); parameters.push_back(pcount);
21 CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","",false,false,true); parameters.push_back(pgroup);
22 CommandParameter pgroups("groups", "String", "", "", "", "", "","",false,false); parameters.push_back(pgroups);
23 CommandParameter piters("iters", "Number", "", "1000", "", "", "","",false,false); parameters.push_back(piters);
24 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
25 CommandParameter psubsample("subsample", "String", "", "", "", "", "","",false,false); parameters.push_back(psubsample);
26 CommandParameter pwithreplacement("withreplacement", "Boolean", "", "F", "", "", "","",false,false,true); parameters.push_back(pwithreplacement);
27 CommandParameter pconsensus("consensus", "Boolean", "", "F", "", "", "","tree",false,false); parameters.push_back(pconsensus);
28 CommandParameter prandom("random", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(prandom);
29 CommandParameter pdistance("distance", "Multiple", "column-lt-square-phylip", "column", "", "", "","phylip-column",false,false); parameters.push_back(pdistance);
30 CommandParameter proot("root", "Boolean", "F", "", "", "", "","",false,false); parameters.push_back(proot);
31 CommandParameter pseed("seed", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pseed);
32 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
33 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
34
35 vector<string> tempOutNames;
36 outputTypes["weighted"] = tempOutNames;
37 outputTypes["wsummary"] = tempOutNames;
38 outputTypes["phylip"] = tempOutNames;
39 outputTypes["column"] = tempOutNames;
40 outputTypes["tree"] = tempOutNames;
41
42 abort = false; calledHelp = false;
43
44 vector<string> myArray;
45 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
46 return myArray;
47 }
48 catch(exception& e) {
49 m->errorOut(e, "UnifracWeightedCommand", "setParameters");
50 exit(1);
51 }
52 }
53 //**********************************************************************************************************************
getHelpString()54 string UnifracWeightedCommand::getHelpString(){
55 try {
56 string helpString = "";
57 helpString += "The unifrac.weighted command parameters are tree, group, name, count, groups, iters, distance, processors, root, subsample, consensus and random. tree parameter is required unless you have valid current tree file.\n";
58 helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. You must enter at least 2 valid groups.\n";
59 helpString += "The group names are separated by dashes. The iters parameter allows you to specify how many random trees you would like compared to your tree.\n";
60 helpString += "The distance parameter allows you to create a distance file from the results. The default is false.\n";
61 helpString += "The random parameter allows you to shut off the comparison to random trees. The default is false, meaning don't compare your trees with randomly generated trees.\n";
62 helpString += "The root parameter allows you to include the entire root in your calculations. The default is false, meaning stop at the root for this comparision instead of the root of the entire tree.\n";
63 helpString += "The processors parameter allows you to specify the number of processors to use. The default is 1.\n";
64 helpString += "The subsample parameter allows you to enter the size pergroup of the sample or you can set subsample=T and mothur will use the size of your smallest group. The subsample parameter may only be used with a group file.\n";
65 helpString += "The withreplacement parameter allows you to indicate you want to subsample your data allowing for the same read to be included multiple times. Default=f. \n";
66 helpString += "The consensus parameter allows you to indicate you would like trees built from distance matrices created with the results, as well as a consensus tree built from these trees. Default=F.\n";
67 helpString += "The unifrac.weighted command should be in the following format: unifrac.weighted(groups=yourGroups, iters=yourIters).\n";
68 helpString += "Example unifrac.weighted(groups=A-B-C, iters=500).\n";
69 helpString += "The default value for groups is all the groups in your groupfile, and iters is 1000.\n";
70 helpString += "The unifrac.weighted command output two files: .weighted and .wsummary their descriptions are in the manual.\n";
71
72 return helpString;
73 }
74 catch(exception& e) {
75 m->errorOut(e, "UnifracWeightedCommand", "getHelpString");
76 exit(1);
77 }
78 }
79 //**********************************************************************************************************************
getOutputPattern(string type)80 string UnifracWeightedCommand::getOutputPattern(string type) {
81 try {
82 string pattern = "";
83 if (type == "weighted") { pattern = "[filename],weighted-[filename],[tag],weighted"; }
84 else if (type == "wsummary") { pattern = "[filename],[tag],wsummary"; }
85 else if (type == "phylip") { pattern = "[filename],[tag],[tag2],dist"; }
86 else if (type == "column") { pattern = "[filename],[tag],[tag2],dist"; }
87 else if (type == "tree") { pattern = "[filename],[tag],[tag2],tre"; }
88 else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->setControl_pressed(true); }
89
90 return pattern;
91 }
92 catch(exception& e) {
93 m->errorOut(e, "UnifracWeightedCommand", "getOutputPattern");
94 exit(1);
95 }
96 }
97 /***********************************************************/
UnifracWeightedCommand(string option)98 UnifracWeightedCommand::UnifracWeightedCommand(string option) : Command() {
99 try {
100
101 if(option == "help") { help(); abort = true; calledHelp = true; }
102 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
103 else if(option == "category") { abort = true; calledHelp = true; }
104
105 else {
106 OptionParser parser(option, setParameters());
107 map<string,string> parameters=parser.getParameters();
108
109 ValidParameters validParameter;
110 treefile = validParameter.validFile(parameters, "tree");
111 if (treefile == "not open") { treefile = ""; abort = true; }
112 else if (treefile == "not found") { //if there is a current design file, use it
113 treefile = current->getTreeFile();
114 if (treefile != "") { m->mothurOut("Using " + treefile + " as input file for the tree parameter.\n"); }
115 else { m->mothurOut("You have no current tree file and the tree parameter is required.\n"); abort = true; }
116 }else { current->setTreeFile(treefile); }
117
118 //check for required parameters
119 groupfile = validParameter.validFile(parameters, "group");
120 if (groupfile == "not open") { abort = true; }
121 else if (groupfile == "not found") { groupfile = ""; }
122 else { current->setGroupFile(groupfile); }
123
124 namefile = validParameter.validFile(parameters, "name");
125 if (namefile == "not open") { namefile = ""; abort = true; }
126 else if (namefile == "not found") { namefile = ""; }
127 else { current->setNameFile(namefile); }
128
129 countfile = validParameter.validFile(parameters, "count");
130 if (countfile == "not open") { countfile = ""; abort = true; }
131 else if (countfile == "not found") { countfile = ""; }
132 else { current->setCountFile(countfile); }
133
134 if ((namefile != "") && (countfile != "")) {
135 m->mothurOut("[ERROR]: you may only use one of the following: name or count.\n"); abort = true;
136 }
137
138 if ((groupfile != "") && (countfile != "")) {
139 m->mothurOut("[ERROR]: you may only use one of the following: group or count.\n"); abort=true;
140 }
141
142 if (outputdir == ""){ outputdir = util.hasPath(treefile); }
143
144
145 //check for optional parameter and set defaults
146 // ...at some point should added some additional type checking...
147 groups = validParameter.valid(parameters, "groups");
148 if (groups == "not found") { groups = ""; }
149 else {
150 util.splitAtDash(groups, Groups);
151 if (Groups.size() != 0) { if (Groups[0]== "all") { Groups.clear(); } }
152 }
153
154 itersString = validParameter.valid(parameters, "iters"); if (itersString == "not found") { itersString = "1000"; }
155 util.mothurConvert(itersString, iters);
156
157 string temp = validParameter.valid(parameters, "distance");
158 if (temp == "not found") { phylip = false; outputForm = ""; }
159 else{
160 if (temp=="phylip") { temp = "lt"; }
161 if ((temp == "lt") || (temp == "column") || (temp == "square")) { phylip = true; outputForm = temp; }
162 else { m->mothurOut("Options for distance are: lt, square, or column. Using lt.\n"); phylip = true; outputForm = "lt"; }
163 }
164
165 temp = validParameter.valid(parameters, "random"); if (temp == "not found") { temp = "F"; }
166 random = util.isTrue(temp);
167
168 temp = validParameter.valid(parameters, "root"); if (temp == "not found") { temp = "F"; }
169 includeRoot = util.isTrue(temp);
170
171 temp = validParameter.valid(parameters, "processors"); if (temp == "not found"){ temp = current->getProcessors(); }
172 processors = current->setProcessors(temp);
173
174 temp = validParameter.valid(parameters, "subsample"); if (temp == "not found") { temp = "F"; }
175 if (util.isNumeric1(temp)) { util.mothurConvert(temp, subsampleSize); subsample = true; }
176 else {
177 if (util.isTrue(temp)) { subsample = true; subsampleSize = -1; } //we will set it to smallest group later
178 else { subsample = false; }
179 }
180
181 if (!subsample) { subsampleIters = 0; }
182 else { subsampleIters = iters; }
183
184 temp = validParameter.valid(parameters, "withreplacement"); if (temp == "not found"){ temp = "f"; }
185 withReplacement = util.isTrue(temp);
186
187 temp = validParameter.valid(parameters, "consensus"); if (temp == "not found") { temp = "F"; }
188 consensus = util.isTrue(temp);
189
190 if (subsample && random) { m->mothurOut("[ERROR]: random must be false, if subsample=t.\n"); abort=true; }
191 if (countfile == "") { if (subsample && (groupfile == "")) { m->mothurOut("[ERROR]: if subsample=t, a group file must be provided.\n"); abort=true; } }
192 else {
193 CountTable testCt;
194 if ((!testCt.testGroups(countfile)) && (subsample)) {
195 m->mothurOut("[ERROR]: if subsample=t, a count file with group info must be provided.\n"); abort=true;
196 }
197 }
198 if (subsample && (!phylip)) { phylip=true; outputForm = "lt"; }
199 if (consensus && (!subsample)) { m->mothurOut("[ERROR]: you cannot use consensus without subsample.\n"); abort=true; }
200
201 if (countfile=="") {
202 if (namefile == "") {
203 vector<string> files; files.push_back(treefile);
204 if (!current->getMothurCalling()) { parser.getNameFile(files); }
205 }
206 }
207 }
208 }
209 catch(exception& e) {
210 m->errorOut(e, "UnifracWeightedCommand", "UnifracWeightedCommand");
211 exit(1);
212 }
213 }
214 /***********************************************************/
execute()215 int UnifracWeightedCommand::execute() {
216 try {
217 if (abort) { if (calledHelp) { return 0; } return 2; }
218
219 long start = time(NULL);
220
221 TreeReader* reader;
222 if (countfile == "") { reader = new TreeReader(treefile, groupfile, namefile); }
223 else { reader = new TreeReader(treefile, countfile); }
224 vector<Tree*> T = reader->getTrees();
225 CountTable* ct; ct = T[0]->getCountTable();
226 if ((Groups.size() == 0) || (Groups.size() < 2)) { Groups = ct->getNamesOfGroups(); } //must have at least 2 groups to compare
227 delete reader;
228
229 if (m->getControl_pressed()) { delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; } return 0; }
230
231 map<string, string> variables;
232 vector<string> nameGroups = ct->getNamesOfGroups();
233 if (Groups.size() < 2) { m->mothurOut("[ERROR]: You cannot run unifrac.weighted with less than 2 groups, aborting.\n"); delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; } return 0; }
234 if (m->getControl_pressed()) { delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; } return 0; }
235
236 if ((Groups.size() == 0) || (Groups.size() < 2)) { Groups = ct->getNamesOfGroups(); } //must have at least 2 groups to compare
237
238 //set or check size
239 if (subsample) {
240 //user has not set size, set size = smallest samples size
241 if (subsampleSize == -1) {
242 subsampleSize = ct->getNumSeqsSmallestGroup();
243 m->mothurOut("\nSetting subsample size to " + toString(subsampleSize) + ".\n\n");
244 }else { //eliminate any too small groups
245 vector<string> newGroups = Groups;
246 Groups.clear();
247 for (int i = 0; i < newGroups.size(); i++) {
248 int thisSize = ct->getGroupCount(newGroups[i]);
249
250 if (thisSize >= subsampleSize) { Groups.push_back(newGroups[i]); }
251 else { m->mothurOut("You have selected a size that is larger than "+newGroups[i]+" number of sequences, removing "+newGroups[i]+".\n"); }
252 }
253 }
254 }
255
256 vector<string> groupComb; util.getCombos(groupComb, Groups, numComp); //here in case some groups are removed by subsample
257
258 if (numComp < processors) { processors = numComp; m->mothurOut("Reducing processors to " + toString(numComp) + ".\n"); }
259 if (consensus && (numComp < 2)) { m->mothurOut("consensus can only be used with numComparisions greater than 1, setting consensus=f.\n"); consensus=false; }
260
261 Weighted weighted(includeRoot, Groups);
262
263 for (int i = 0; i < T.size(); i++) {
264
265 if (m->getControl_pressed()) { break; }
266
267 vector<double> WScoreSig; //tree weighted score signifigance when compared to random trees - percentage of random trees with that score or lower.
268 vector< vector<double> > uScores; uScores.resize(numComp); //data[0] = weightedscore AB, data[1] = weightedscore AC...
269 vector<double> userData; userData.resize(numComp,0); //weighted score info for user tree. data[0] = weightedscore AB, data[1] = weightedscore AC...
270 vector<double> randomData; randomData.resize(numComp,0); //weighted score info for random trees. data[0] = weightedscore AB, data[1] = weightedscore AC...
271
272 userData = weighted.getValues(T[i], processors, outputdir); //userData[0] = weightedscore
273 if (m->getControl_pressed()) { break; }
274
275 if (phylip) { createPhylipFile((i+1), userData); }
276 if (random) { runRandomCalcs(T[i], ct, userData, (i+1), WScoreSig, groupComb); }
277 printWSummaryFile((i+1), userData, WScoreSig, groupComb);
278
279 if (m->getControl_pressed()) { break; }
280
281 //subsample loop
282 vector< vector<double> > calcDistsTotals; //each iter, each groupCombos dists. this will be used to make .dist files
283 SubSample sample;
284 for (int thisIter = 0; thisIter < subsampleIters; thisIter++) { //subsampleIters=0, if subsample=f.
285 if (m->getControl_pressed()) { break; }
286
287 //uses method of setting groups to doNotIncludeMe
288 //copy to preserve old one - would do this in subsample but memory cleanup becomes messy.
289 CountTable* newCt = new CountTable();
290 int sampleTime = time(NULL);
291
292 Tree* subSampleTree;
293 if (withReplacement) { subSampleTree = sample.getSampleWithReplacement(T[i], ct, newCt, subsampleSize, Groups); }
294 else { subSampleTree = sample.getSample(T[i], ct, newCt, subsampleSize, Groups); }
295
296 if (m->getDebug()) { m->mothurOut("[DEBUG]: iter " + toString(thisIter) + " took " + toString(time(NULL) - sampleTime) + " seconds to sample tree.\n"); }
297
298 //call new weighted function
299 vector<double> iterData; iterData.resize(numComp,0);
300 Weighted thisWeighted(includeRoot, Groups);
301 iterData = thisWeighted.getValues(subSampleTree, processors, outputdir); //userData[0] = weightedscore
302
303 //save data to make ave dist, std dist
304 calcDistsTotals.push_back(iterData);
305
306 delete newCt; delete subSampleTree;
307
308 if((thisIter+1) % 100 == 0){ m->mothurOutJustToScreen(toString(thisIter+1)+"\n"); }
309 }
310
311 if (m->getControl_pressed()) { break; }
312
313 if (subsample) { getAverageSTDMatrices(calcDistsTotals, i); }
314 if (consensus) { getConsensusTrees(calcDistsTotals, i); }
315 }
316 delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; }
317
318 if (m->getControl_pressed()) { for (int i = 0; i < outputNames.size(); i++) { util.mothurRemove(outputNames[i]); } return 0; }
319
320 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to run unifrac.weighted.\n");
321
322 //set phylip file as new current phylipfile
323 string currentName = "";
324 itTypes = outputTypes.find("phylip");
325 if (itTypes != outputTypes.end()) {
326 if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setPhylipFile(currentName); }
327 }
328
329 //set column file as new current columnfile
330 itTypes = outputTypes.find("column");
331 if (itTypes != outputTypes.end()) {
332 if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setColumnFile(currentName); }
333 }
334
335 m->mothurOut("\nOutput File Names: \n");
336 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i] +"\n"); } m->mothurOutEndLine();
337
338 return 0;
339 }
340 catch(exception& e) {
341 m->errorOut(e, "UnifracWeightedCommand", "execute");
342 exit(1);
343 }
344 }
345 /**************************************************************************************************/
getAverageSTDMatrices(vector<vector<double>> & dists,int treeNum)346 int UnifracWeightedCommand::getAverageSTDMatrices(vector< vector<double> >& dists, int treeNum) {
347 try {
348 vector<double> averages = util.getAverages(dists);
349 vector<double> stdDev = util.getStandardDeviation(dists, averages);
350
351 int numGroups = Groups.size();
352 vector< vector<double> > avedists;
353 for (int i = 0; i < numGroups; i++) {
354 vector<double> temp; temp.resize(numGroups, 0.0);
355 avedists.push_back(temp);
356 }
357
358 //make matrix with scores in it
359 vector< vector<double> > stddists; //stddists.resize(m->getNumGroups());
360 for (int i = 0; i < numGroups; i++) {
361 vector<double> temp; for (int j = 0; j < numGroups; j++) { temp.push_back(0.0); }
362 stddists.push_back(temp);
363 }
364
365 //flip it so you can print it
366 int count = 0;
367 for (int r=0; r< numGroups; r++) {
368 for (int l = 0; l < r; l++) {
369 avedists[r][l] = averages[count];
370 avedists[l][r] = averages[count];
371 stddists[r][l] = stdDev[count];
372 stddists[l][r] = stdDev[count];
373 count++;
374 }
375 }
376
377 map<string, string> variables;
378 variables["[filename]"] = outputdir + util.getSimpleName(treefile);
379 variables["[tag]"] = toString(treeNum+1);
380 variables["[tag2]"] = "weighted.ave";
381 string aveFileName = getOutputFileName("phylip",variables);
382 if (outputForm != "column") { outputNames.push_back(aveFileName); outputTypes["phylip"].push_back(aveFileName); }
383 else { outputNames.push_back(aveFileName); outputTypes["column"].push_back(aveFileName); }
384 ofstream out; util.openOutputFile(aveFileName, out);
385
386 variables["[tag2]"] = "weighted.std";
387 string stdFileName = getOutputFileName("phylip",variables);
388 if (outputForm != "column") { outputNames.push_back(stdFileName); outputTypes["phylip"].push_back(stdFileName); }
389 else { outputNames.push_back(stdFileName); outputTypes["column"].push_back(stdFileName); }
390 ofstream outStd; util.openOutputFile(stdFileName, outStd);
391
392 if ((outputForm == "lt") || (outputForm == "square")) {
393 //output numSeqs
394 out << numGroups << endl;
395 outStd << numGroups << endl;
396 }
397
398 //output to file
399 for (int r=0; r< numGroups; r++) {
400 string name = Groups[r];
401 if (name.length() < 10) { while (name.length() < 10) { name += " "; } } //pad with spaces to make compatible
402
403 if (outputForm == "lt") {
404 out << name; outStd << name;
405 for (int l = 0; l < r; l++) { out << '\t' << avedists[r][l]; outStd << '\t' << stddists[r][l];} //output distances
406 out << endl; outStd << endl;
407 }else if (outputForm == "square") {
408 out << name; outStd << name;
409 for (int l = 0; l < numGroups; l++) { out << '\t' << avedists[r][l]; outStd << '\t' << stddists[r][l]; } //output distances
410 out << endl; outStd << endl;
411 }else{
412 for (int l = 0; l < r; l++) {
413 string otherName = Groups[l];
414 if (otherName.length() < 10) { while (otherName.length() < 10) { otherName += " "; } } //pad with spaces to make compatible
415
416 out << name << '\t' << otherName << '\t' << avedists[r][l] << endl; //output distances
417 outStd << name << '\t' << otherName << '\t' << stddists[r][l] << endl;
418 }
419 }
420 }
421 out.close(); outStd.close();
422
423 return 0;
424 }
425 catch(exception& e) {
426 m->errorOut(e, "UnifracWeightedCommand", "getAverageSTDMatrices");
427 exit(1);
428 }
429 }
430 /**************************************************************************************************/
getConsensusTrees(vector<vector<double>> & dists,int treeNum)431 int UnifracWeightedCommand::getConsensusTrees(vector< vector<double> >& dists, int treeNum) {
432 try {
433 ///create treemap class from groupmap for tree class to use
434 CountTable newCt;
435 set<string> nameMap;
436 map<string, string> groupMap;
437 set<string> gps;
438 int numGroups = Groups.size();
439 for (int i = 0; i < numGroups; i++) {
440 nameMap.insert(Groups[i]);
441 gps.insert(Groups[i]);
442 groupMap[Groups[i]] = Groups[i];
443 }
444 newCt.createTable(nameMap, groupMap, gps);
445
446 vector<Tree*> newTrees = buildTrees(dists, treeNum, newCt); //also creates .all.tre file containing the trees created
447
448 if (m->getControl_pressed()) { return 0; }
449
450 Consensus con;
451 Tree* conTree = con.getTree(newTrees);
452
453 //create a new filename
454 map<string, string> variables;
455 variables["[filename]"] = outputdir + util.getRootName(util.getSimpleName(treefile));
456 variables["[tag]"] = toString(treeNum+1);
457 variables["[tag2]"] = "weighted.cons";
458 string conFile = getOutputFileName("tree",variables);
459 outputNames.push_back(conFile); outputTypes["tree"].push_back(conFile);
460 ofstream outTree;
461 util.openOutputFile(conFile, outTree);
462
463 if (conTree != NULL) { conTree->print(outTree, "boot"); delete conTree; } outTree.close();
464
465 return 0;
466 }
467 catch(exception& e) {
468 m->errorOut(e, "UnifracWeightedCommand", "getConsensusTrees");
469 exit(1);
470 }
471 }
472 /**************************************************************************************************/
buildTrees(vector<vector<double>> & dists,int treeNum,CountTable & myct)473 vector<Tree*> UnifracWeightedCommand::buildTrees(vector< vector<double> >& dists, int treeNum, CountTable& myct) {
474 try {
475 vector<Tree*> trees;
476
477 //create a new filename
478 map<string, string> variables;
479 variables["[filename]"] = outputdir + util.getRootName(util.getSimpleName(treefile));
480 variables["[tag]"] = toString(treeNum+1);
481 variables["[tag2]"] = "weighted.all";
482 string outputFile = getOutputFileName("tree",variables);
483 outputNames.push_back(outputFile); outputTypes["tree"].push_back(outputFile);
484
485 ofstream outAll;
486 util.openOutputFile(outputFile, outAll);
487 int numGroups = Groups.size();
488
489 for (int i = 0; i < dists.size(); i++) { //dists[0] are the dists for the first subsampled tree.
490
491 if (m->getControl_pressed()) { break; }
492
493 //make matrix with scores in it
494 vector< vector<double> > sims; sims.resize(numGroups);
495 for (int j = 0; j < numGroups; j++) { sims[j].resize(numGroups, 0.0); }
496
497 int count = 0;
498 for (int r=0; r<numGroups; r++) {
499 for (int l = 0; l < r; l++) {
500 double sim = -(dists[i][count]-1.0);
501 sims[r][l] = sim;
502 sims[l][r] = sim;
503 count++;
504 }
505 }
506
507 //create tree
508 Tree* tempTree = new Tree(&myct, sims, Groups);
509 tempTree->assembleTree();
510 trees.push_back(tempTree);
511 tempTree->print(outAll); //print tree
512 }
513 outAll.close();
514
515 if (m->getControl_pressed()) { for (int i = 0; i < trees.size(); i++) { delete trees[i]; trees[i] = NULL; } util.mothurRemove(outputFile); }
516
517 return trees;
518 }
519 catch(exception& e) {
520 m->errorOut(e, "UnifracWeightedCommand", "buildTrees");
521 exit(1);
522 }
523 }
524 /**************************************************************************************************/
runRandomCalcs(Tree * thisTree,CountTable * ct,vector<double> usersScores,int iter,vector<double> & WScoreSig,vector<string> groupComb)525 int UnifracWeightedCommand::runRandomCalcs(Tree* thisTree, CountTable* ct, vector<double> usersScores, int iter, vector<double>& WScoreSig, vector<string> groupComb) {
526 try {
527 map<string, string> variables;
528 variables["[filename]"] = outputdir + util.getSimpleName(treefile);
529 variables["[tag]"] = toString(iter);
530 string wFileName = getOutputFileName("weighted", variables);
531 ColumnFile output(wFileName, itersString); ofstream out; util.openOutputFile(wFileName, out); out.close();
532 outputNames.push_back(wFileName); outputTypes["weighted"].push_back(wFileName);
533
534 //calculate number of comparisons i.e. with groups A,B,C = AB, AC, BC = 3;
535 vector< vector<string> > namesOfGroupCombos;
536 int numGroups = Groups.size();
537 for (int a=0; a<numGroups; a++) {
538 for (int l = 0; l < a; l++) {
539 vector<string> groups; groups.push_back(Groups[a]); groups.push_back(Groups[l]);
540 namesOfGroupCombos.push_back(groups);
541 }
542 }
543
544 vector<vector<int> > randomTreeNodes;
545 for (int f = 0; f < numComp; f++) { randomTreeNodes.push_back(thisTree->getNodes(namesOfGroupCombos[f])); }
546 vector<vector<int> > savedRandomTreeNodes = randomTreeNodes;
547
548 //get scores for random trees
549 vector<vector<double> > rScores; rScores.resize(numComp);
550 for (int i = 0; i < iters; i++) {
551 if (m->getControl_pressed()) { return 0; }
552 randomTreeNodes = savedRandomTreeNodes;
553
554 for (int f = 0; f < numComp; f++) { util.mothurRandomShuffle(randomTreeNodes[f]); }
555
556 vector<double> thisItersRScores = createProcesses(thisTree, ct, namesOfGroupCombos, randomTreeNodes);
557
558 for (int f = 0; f < numComp; f++) { rScores[f].push_back(thisItersRScores[f]); }
559
560 if((i+1) % 100 == 0){ m->mothurOut(toString(i+1)+"\n"); }
561 }
562
563 //find the signifigance of the score for summary file
564 for (int f = 0; f < numComp; f++) {
565 //sort random scores
566 sort(rScores[f].begin(), rScores[f].end());
567
568 //the index of the score higher than yours is returned
569 //so if you have 1000 random trees the index returned is 100
570 //then there are 900 trees with a score greater then you.
571 //giving you a signifigance of 0.900
572 int index = findIndex(usersScores[f], f, rScores); if (index == -1) { m->mothurOut("error in UnifracWeightedCommand\n"); exit(1); } //error code
573
574 //the signifigance is the number of trees with the users score or higher
575 WScoreSig.push_back((iters-index)/(float)iters);
576 }
577
578 set<double> validScores; //map contains scores from random
579 vector< map<double, double> > rScoreFreq; //map <weighted score, number of random trees with that score.> -vector entry for each combination.
580 vector< map<double, double> > rCumul; //map <weighted score, cumulative percentage of number of random trees with that score or higher.> -vector entry for each c
581 calculateFreqsCumuls(validScores, rScores, rScoreFreq, rCumul);
582
583 vector<string> tags; tags.push_back("Score"); tags.push_back("RandFreq"); tags.push_back("RandCumul");
584 for(int a = 0; a < numComp; a++) {
585 output.setLabelName(groupComb[a], tags);
586 //print each line
587 for (set<double>::iterator it = validScores.begin(); it != validScores.end(); it++) {
588 vector<double> data; data.push_back(*it); data.push_back(rScoreFreq[a][*it]); data.push_back(rCumul[a][*it]);
589 output.updateOutput(data);
590 }
591 output.resetFile();
592 }
593 return 0;
594 }
595 catch(exception& e) {
596 m->errorOut(e, "UnifracWeightedCommand", "runRandomCalcs");
597 exit(1);
598 }
599 }
600 /***********************************************************************/
601 struct weightedRandomData {
602 bool includeRoot;
603 int count, numComps, start, num;
604 vector<string> Groups, Treenames;
605 vector<double> scores;
606 vector< vector<string> > namesOfGroupCombos;
607 vector<vector<int> > randomizedTreeNodes;
608 MothurOut* m;
609 Tree* t;
610 CountTable* ct;
611 Utils util;
612
weightedRandomDataweightedRandomData613 weightedRandomData(){}
weightedRandomDataweightedRandomData614 weightedRandomData(int st, int en, vector< vector<string> > ngc, Tree* tree, CountTable* count, bool ir, vector<string> g, vector<vector<int> > randomTreeNodes) {
615 m = MothurOut::getInstance();
616 num = en;
617 start = st;
618 namesOfGroupCombos = ngc;
619 numComps = namesOfGroupCombos.size();
620 randomizedTreeNodes = randomTreeNodes;
621 t = tree;
622 ct = count;
623 includeRoot = ir;
624 Groups = g;
625 Treenames = t->getTreeNames();
626 count = 0;
627 }
628 };
629 /**************************************************************************************************/
driverWeightedRandom(weightedRandomData * params)630 void driverWeightedRandom(weightedRandomData* params) {
631 try {
632 Weighted weighted(params->includeRoot, params->Groups);
633
634 params->count = 0;
635
636 Tree* randT = new Tree(params->ct, params->Treenames);
637
638 for (int h = params->start; h < (params->start+params->num); h++) {
639
640 if (params->m->getControl_pressed()) { break; }
641
642 string groupA = params->namesOfGroupCombos[h][0];
643 string groupB = params->namesOfGroupCombos[h][1];
644 vector<int> treeNodesFromTheseGroups = params->randomizedTreeNodes[h];
645
646 //copy T[i]'s info.
647 randT->getCopy(params->t);
648
649 //create a random tree with same topology as T[i], but different labels
650 randT->assembleRandomUnifracTree(params->randomizedTreeNodes[h]);
651
652 if (params->m->getControl_pressed()) { break; }
653
654 //get wscore of random tree
655 EstOutput randomData = weighted.getValues(randT, groupA, groupB);
656
657 if (params->m->getControl_pressed()) { break; }
658
659 //save scores
660 params->scores.push_back(randomData[0]);
661 }
662
663 delete randT;
664 }
665 catch(exception& e) {
666 params->m->errorOut(e, "UnifracWeightedCommand", "driver");
667 exit(1);
668 }
669 }
670 /**************************************************************************************************/
createProcesses(Tree * t,CountTable * ct,vector<vector<string>> namesOfGroupCombos,vector<vector<int>> & randomizedTreeNodes)671 vector<double> UnifracWeightedCommand::createProcesses(Tree* t, CountTable* ct, vector< vector<string> > namesOfGroupCombos, vector<vector<int> >& randomizedTreeNodes) {
672 try {
673 //breakdown work between processors
674 vector<linePair> lines;
675 int remainingPairs = namesOfGroupCombos.size();
676 if (remainingPairs < processors) { processors = remainingPairs; }
677 int startIndex = 0;
678 for (int remainingProcessors = processors; remainingProcessors > 0; remainingProcessors--) {
679 int numPairs = remainingPairs; //case for last processor
680 if (remainingProcessors != 1) { numPairs = ceil(remainingPairs / remainingProcessors); }
681 lines.push_back(linePair(startIndex, numPairs)); //startIndex, numPairs
682 startIndex = startIndex + numPairs;
683 remainingPairs = remainingPairs - numPairs;
684 }
685 //create array of worker threads
686 vector<std::thread*> workerThreads;
687 vector<weightedRandomData*> data;
688
689 //Lauch worker threads
690 for (int i = 0; i < processors-1; i++) {
691 CountTable* copyCount = new CountTable();
692 copyCount->copy(ct);
693 vector<string> Treenames = t->getTreeNames();
694 Tree* copyTree = new Tree(copyCount, Treenames);
695 copyTree->getCopy(t);
696
697 weightedRandomData* dataBundle = new weightedRandomData(lines[i+1].start, lines[i+1].end, namesOfGroupCombos, copyTree, copyCount, includeRoot, Groups, randomizedTreeNodes);
698 data.push_back(dataBundle);
699 workerThreads.push_back(new std::thread(driverWeightedRandom, dataBundle));
700 }
701
702 weightedRandomData* dataBundle = new weightedRandomData(lines[0].start, lines[0].end, namesOfGroupCombos, t, ct, includeRoot, Groups, randomizedTreeNodes);
703 driverWeightedRandom(dataBundle);
704 vector<double> scores = dataBundle->scores;
705
706
707 for (int i = 0; i < processors-1; i++) {
708 workerThreads[i]->join();
709
710 scores.insert(scores.end(), data[i]->scores.begin(), data[i]->scores.end());
711
712 delete data[i]->t; delete data[i]->ct; delete data[i]; delete workerThreads[i];
713 }
714 delete dataBundle;
715 return scores;
716 }
717 catch(exception& e) {
718 m->errorOut(e, "UnifracWeightedCommand", "createProcesses");
719 exit(1);
720 }
721 }
722 /***********************************************************/
printWSummaryFile(int treeIndex,vector<double> utreeScores,vector<double> WScoreSig,vector<string> groupComb)723 void UnifracWeightedCommand::printWSummaryFile(int treeIndex, vector<double> utreeScores, vector<double> WScoreSig, vector<string> groupComb) {
724 try {
725 map<string, string> variables;
726 variables["[filename]"] = outputdir + util.getSimpleName(treefile);
727 variables["[tag]"] = toString(treeIndex);
728 sumFile = getOutputFileName("wsummary",variables);
729 outputNames.push_back(sumFile); outputTypes["wsummary"].push_back(sumFile);
730 ofstream outSum; util.openOutputFile(sumFile, outSum);
731
732 //column headers
733 outSum << "Tree#" << '\t' << "Groups" << '\t' << "WScore" << '\t';
734 m->mothurOut("Tree#\tGroups\tWScore\t");
735 if (random) { outSum << "WSig"; m->mothurOut("WSig"); }
736 outSum << endl; m->mothurOutEndLine();
737
738 //format output
739 outSum.setf(ios::fixed, ios::floatfield); outSum.setf(ios::showpoint);
740
741 //print each line
742 int precisionLength = itersString.length();
743 for (int j = 0; j < numComp; j++) {
744 if (random) {
745 if (WScoreSig[j] > (1/(float)iters)) {
746 outSum << setprecision(6) << treeIndex << '\t' << groupComb[j] << '\t' << utreeScores[j] << '\t' << setprecision(precisionLength) << WScoreSig[j] << endl;
747 cout << setprecision(6) << treeIndex << '\t' << groupComb[j] << '\t' << utreeScores[j] << '\t' << setprecision(precisionLength) << WScoreSig[j] << endl;
748 m->mothurOutJustToLog(toString(treeIndex) +"\t" + groupComb[j] +"\t" + toString(utreeScores[j]) +"\t" + toString(WScoreSig[j]) + "\n");
749 }else{
750 outSum << setprecision(6) << treeIndex << '\t' << groupComb[j] << '\t' << utreeScores[j] << '\t' << setprecision(precisionLength) << "<" << (1/float(iters)) << endl;
751 cout << setprecision(6) << treeIndex << '\t' << groupComb[j] << '\t' << utreeScores[j] << '\t' << setprecision(precisionLength) << "<" << (1/float(iters)) << endl;
752 m->mothurOutJustToLog(toString(treeIndex) +"\t" + groupComb[j] +"\t" + toString(utreeScores[j]) +"\t<" + toString((1/float(iters))) + "\n");
753 }
754 }else{
755 outSum << setprecision(6) << treeIndex << '\t' << groupComb[j] << '\t' << utreeScores[j] << endl;
756 cout << setprecision(6) << treeIndex << '\t' << groupComb[j] << '\t' << utreeScores[j] << endl;
757 m->mothurOutJustToLog(toString(treeIndex) +"\t" + groupComb[j] +"\t" + toString(utreeScores[j]) +"\n");
758 }
759 }
760 outSum.close();
761 }
762 catch(exception& e) {
763 m->errorOut(e, "UnifracWeightedCommand", "printWSummaryFile");
764 exit(1);
765 }
766 }
767 /***********************************************************/
createPhylipFile(int treeIndex,vector<double> utreeScores)768 void UnifracWeightedCommand::createPhylipFile(int treeIndex, vector<double> utreeScores) {
769 try {
770 int count = 0;
771 int numGroups = Groups.size();
772
773 string phylipFileName;
774 map<string, string> variables;
775 variables["[filename]"] = outputdir + util.getSimpleName(treefile);
776 variables["[tag]"] = toString(treeIndex);
777 if ((outputForm == "lt") || (outputForm == "square")) {
778 variables["[tag2]"] = "weighted.phylip";
779 phylipFileName = getOutputFileName("phylip",variables);
780 outputNames.push_back(phylipFileName); outputTypes["phylip"].push_back(phylipFileName);
781 }else { //column
782 variables["[tag2]"] = "weighted.column";
783 phylipFileName = getOutputFileName("column",variables);
784 outputNames.push_back(phylipFileName); outputTypes["column"].push_back(phylipFileName);
785 }
786
787 ofstream out; util.openOutputFile(phylipFileName, out);
788
789 if ((outputForm == "lt") || (outputForm == "square")) { out << numGroups << endl; }
790
791 //make matrix with scores in it
792 vector< vector<float> > dists; dists.resize(numGroups);
793 for (int i = 0; i < numGroups; i++) { dists[i].resize(numGroups, 0.0); }
794
795 //flip it so you can print it
796 for (int r=0; r< numGroups; r++) { for (int l = 0; l < r; l++) { dists[r][l] = utreeScores[count]; dists[l][r] = utreeScores[count]; count++; } }
797
798 //output to file
799 for (int r=0; r<numGroups; r++) {
800 string name = Groups[r];
801 if (name.length() < 10) { while (name.length() < 10) { name += " "; } } //pad with spaces to make compatible
802
803 if (outputForm == "lt") { out << name; for (int l = 0; l < r; l++) { out << '\t' << dists[r][l]; } out << endl; }
804 else if (outputForm == "square") { out << name; for (int l = 0; l < numGroups; l++) { out << '\t' << dists[r][l]; } out << endl; }
805 else{
806 for (int l = 0; l < r; l++) {
807 string otherName = Groups[l];
808 if (otherName.length() < 10) { while (otherName.length() < 10) { otherName += " "; } } //pad with spaces to make compatible
809
810 out << name << '\t' << otherName << '\t' << dists[r][l] << endl;
811 }
812 }
813 }
814 out.close();
815 }
816 catch(exception& e) {
817 m->errorOut(e, "UnifracWeightedCommand", "createPhylipFile");
818 exit(1);
819 }
820 }
821 /***********************************************************/
findIndex(float score,int index,vector<vector<double>> & rScores)822 int UnifracWeightedCommand::findIndex(float score, int index, vector< vector<double> >& rScores) {
823 try{
824 int results = rScores[index].size();
825
826 for (int i = 0; i < rScores[index].size(); i++) { if (rScores[index][i] >= score) { results = i; break; } }
827
828 return results;
829 }
830 catch(exception& e) {
831 m->errorOut(e, "UnifracWeightedCommand", "findIndex");
832 exit(1);
833 }
834 }
835 /***********************************************************/
calculateFreqsCumuls(set<double> & validScores,vector<vector<double>> rScores,vector<map<double,double>> & rScoreFreq,vector<map<double,double>> & rCumul)836 void UnifracWeightedCommand::calculateFreqsCumuls(set<double>& validScores, vector< vector<double> > rScores, vector< map<double, double> >& rScoreFreq, vector< map<double, double> >& rCumul) {
837 try {
838 //clear out old tree values
839 rScoreFreq.clear(); rScoreFreq.resize(numComp); rCumul.clear(); rCumul.resize(numComp); validScores.clear();
840
841 //calculate frequency
842 for (int f = 0; f < numComp; f++) {
843 for (int i = 0; i < rScores[f].size(); i++) { //looks like 0,0,1,1,1,2,4,7... you want to make a map that say rScoreFreq[0] = 2, rScoreFreq[1] = 3...
844 validScores.insert(rScores[f][i]);
845 map<double,double>::iterator it = rScoreFreq[f].find(rScores[f][i]);
846
847 if (it != rScoreFreq[f].end()) { rScoreFreq[f][rScores[f][i]]++; }
848 else { rScoreFreq[f][rScores[f][i]] = 1; }
849 }
850 }
851
852 //calculate rcumul
853 for(int a = 0; a < numComp; a++) {
854 float rcumul = 1.0000;
855 //this loop fills the cumulative maps and put 0.0000 in the score freq map to make it easier to print.
856 for (set<double>::iterator it = validScores.begin(); it != validScores.end(); it++) {
857 //make rscoreFreq map and rCumul
858 map<double,double>::iterator it2 = rScoreFreq[a].find(*it);
859 rCumul[a][*it] = rcumul;
860 //get percentage of random trees with that info
861 if (it2 != rScoreFreq[a].end()) { rScoreFreq[a][*it] /= iters; rcumul-= it2->second; }
862 else { rScoreFreq[a][*it] = 0.0000; } //no random trees with that score
863 }
864 }
865 }
866 catch(exception& e) {
867 m->errorOut(e, "UnifracWeightedCommand", "calculateFreqsCumuls");
868 exit(1);
869 }
870 }
871 /***********************************************************/
872