1 /*
2  *  mothurmetastats.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 7/6/11.
6  *  Copyright 2011 Schloss Lab. All rights reserved.
7  *
8  */
9 
10 #include "mothurmetastats.h"
11 #include "mothurfisher.h"
12 #include "utils.hpp"
13 
14 /***********************************************************/
MothurMetastats(double t,int n)15 MothurMetastats::MothurMetastats(double t, int n) {
16 	try {
17 		m = MothurOut::getInstance();
18 		threshold = t;
19 		numPermutations = n;
20 
21 	}catch(exception& e) {
22 		m->errorOut(e, "MothurMetastats", "MothurMetastats");
23 		exit(1);
24 	}
25 }
26 /***********************************************************/
~MothurMetastats()27 MothurMetastats::~MothurMetastats() {}
28 /***********************************************************/
29  //main metastats function
runMetastats(string outputFileName,vector<vector<double>> & data,int secGroupingStart,vector<string> currentLabels,bool fillProps)30 int MothurMetastats::runMetastats(string outputFileName, vector< vector<double> >& data, int secGroupingStart, vector<string> currentLabels, bool fillProps) {
31     try {
32 
33         numOTUs = data.size();		 //numBins
34 		numSamples = data[0].size(); //numGroups in subset
35         secondGroupingStart = secGroupingStart; //g number of samples in group 1
36 
37         vector< vector<double> > Pmatrix; Pmatrix.resize(numOTUs);
38         for (int i = 0; i < numOTUs; i++) { Pmatrix[i].resize(numSamples, 0.0);  } // the relative proportion matrix
39         vector< vector<double> > C1; C1.resize(numOTUs);
40         for (int i = 0; i < numOTUs; i++) { C1[i].resize(3, 0.0);  } // statistic profiles for class1 and class 2
41         vector< vector<double> > C2; C2.resize(numOTUs);            // mean[1], variance[2], standard error[3]
42         for (int i = 0; i < numOTUs; i++) { C2[i].resize(3, 0.0);  }
43         vector<double> T_statistics; T_statistics.resize(numOTUs, 1); // a place to store the true t-statistics
44         vector<double> pvalues; pvalues.resize(numOTUs, 1); // place to store pvalues
45 
46         //*************************************
47         //      convert to proportions
48         //      generate Pmatrix
49         //*************************************
50         vector<double> totals; totals.resize(numSamples, 0); // sum of numSampless / samples -> numSeqs for each sample
51         //total[i] = total abundance for group[i]
52 		for (int i = 0; i < numSamples; i++) { //each sample
53 			for (int j = 0; j < numOTUs; j++) { //each otu
54 				totals[i] += data[j][i];
55 			}
56         }
57 
58         for (int i = 0; i < numSamples; i++) {  //sample
59 			for (int j = 0; j < numOTUs; j++) { //otu
60                 if (fillProps) {  Pmatrix[j][i] = data[j][i]/totals[i]; }
61                 else           {  Pmatrix[j][i] = data[j][i];           }
62 			}
63         }
64 
65         //#********************************************************************************
66         //# ************************** STATISTICAL TESTING ********************************
67         //#********************************************************************************
68 
69         if (numSamples == 2){  //# then we have a two sample comparison
70             //#************************************************************
71             //#  generate p values fisher's exact test
72             //#************************************************************
73             double total1, total2; total1 = 0; total2 = 0;
74 			//total for first grouping
75             for (int i = 0; i < secondGroupingStart; i++) { total1 += totals[i]; }
76 
77             //total for second grouping
78             for (int i = secondGroupingStart; i < numSamples; i++) { total2 += totals[i]; }
79 
80             vector<double> fish;	fish.resize(numOTUs, 0.0);
81 			vector<double> fish2;	fish2.resize(numOTUs, 0.0);
82             //vector<string> currentLabels = m->getCurrentSharedBinLabels();
83 			for(int i = 0; i < numOTUs; i++){ //numBins
84 
85 				for(int j = 0; j < secondGroupingStart; j++)		{ fish[i] += data[i][j];	}
86 				for(int j = secondGroupingStart; j < numSamples; j++)	{ fish2[i] += data[i][j];	}
87 
88 				double f11, f12, f21, f22;
89                 f11 = fish[i];
90                 f12 = fish2[i];
91                 f21 = total1 - fish[i];
92                 f22 = total2 - fish2[i];
93                 if (fillProps) {   f11 = floor(f11);  f12 = floor(f12); f21 = floor(f21); f22 = floor(f22); }
94 
95 				MothurFisher fisher;
96 				double pre = fisher.fexact(f11, f12, f21, f22, currentLabels[i]);
97 				if (pre > 0.999999999)	{ pre = 1.0; }
98 
99 				if (m->getControl_pressed()) { return 1; }
100 
101 				pvalues[i] = pre;
102 			}
103 
104         }else { //we have multiple subjects per population
105 
106             //#*************************************
107             //#  generate statistics mean, var, stderr
108             //#*************************************
109             for(int i = 0; i < numOTUs; i++){ // for each taxa
110                 //# find the mean of each group
111                 double g1Total = 0.0; double g2Total = 0.0;
112                 for (int j = 0; j < secondGroupingStart; j++)       {     g1Total += Pmatrix[i][j]; }
113                 C1[i][0] = g1Total/(double)(secondGroupingStart);
114                 for (int j = secondGroupingStart; j < numSamples; j++)  {     g2Total += Pmatrix[i][j]; }
115                 C2[i][0] = g2Total/(double)(numSamples-secondGroupingStart);
116 
117                  //# find the variance of each group
118                 double g1Var = 0.0; double g2Var = 0.0;
119                 for (int j = 0; j < secondGroupingStart; j++)       {     g1Var += pow((Pmatrix[i][j]-C1[i][0]), 2);  }
120                 C1[i][1] = g1Var/(double)(secondGroupingStart-1);
121                 for (int j = secondGroupingStart; j < numSamples; j++)  {     g2Var += pow((Pmatrix[i][j]-C2[i][0]), 2);  }
122                 C2[i][1] = g2Var/(double)(numSamples-secondGroupingStart-1);
123 
124                 //# find the std error of each group -std err^2 (will change to std err at end)
125                 C1[i][2] = C1[i][1]/(double)(secondGroupingStart);
126                 C2[i][2] = C2[i][1]/(double)(numSamples-secondGroupingStart);
127             }
128 
129             //#*************************************
130             //#  two sample t-statistics
131             //#*************************************
132             for(int i = 0; i < numOTUs; i++){                  // # for each taxa
133                 double xbar_diff = C1[i][0] - C2[i][0];
134                 double denom = sqrt(C1[i][2] + C2[i][2]);
135                 T_statistics[i] = xbar_diff/denom;  // calculate two sample t-statistic
136             }
137 
138             if (m->getDebug()) {
139                 for (int i = 0; i < numOTUs; i++) {
140                     for (int j = 0; j < 3; j++) {
141                         cout << "C1[" << i+1 << "," << j+1 << "]=" << C1[i][j] << ";" << endl;
142                         cout << "C2[" << i+1 << "," << j+1 << "]=" << C2[i][j] << ";" << endl;
143                     }
144                     cout << "T_statistics[" << i+1 << "]=" << T_statistics[i] << ";" << endl;
145                 }
146 
147                 for (int i = 0; i < numOTUs; i++) {
148                     for (int j = 0; j < numSamples; j++) {
149                         cout << "Fmatrix[" << i+1 << "," << j+1 << "]=" << data[i][j] << ";" << endl;
150                     }
151                 }
152             }
153             //#*************************************
154             //# generate initial permuted p-values
155             //#*************************************
156             pvalues = permuted_pvalues(Pmatrix, T_statistics, data);
157 
158             if (m->getDebug()) {   for (int i = 0; i < numOTUs; i++) { m->mothurOut("[DEBUG]: " + currentLabels[i] + " pvalue = " + toString(pvalues[i]) + "\n"); } }
159 
160             //#*************************************
161             //#  generate p values for sparse data
162             //#  using fisher's exact test
163             //#*************************************
164             double total1, total2; total1 = 0; total2 = 0;
165 			//total for first grouping
166             for (int i = 0; i < secondGroupingStart; i++) { total1 += totals[i];  } //total all seqs in first set
167 
168             //total for second grouping
169             for (int i = secondGroupingStart; i < numSamples; i++) { total2 += totals[i];  } //total all seqs in second set
170 
171             vector<double> fish;	fish.resize(numOTUs, 0.0);
172 			vector<double> fish2;	fish2.resize(numOTUs, 0.0);
173 
174 			for(int i = 0; i < numOTUs; i++){ //numBins
175 
176 				for(int j = 0; j < secondGroupingStart; j++)		{ fish[i] += data[i][j];	}
177 				for(int j = secondGroupingStart; j < numSamples; j++)	{ fish2[i] += data[i][j];	}
178 
179                 if ((fish[i] < secondGroupingStart) && (fish2[i] < (numSamples-secondGroupingStart))) {
180 
181                     double f11, f12, f21, f22;
182                     f11 = fish[i];  if (f11 < 0) { f11 *= -1.0; } f11 = floor(f11);
183                     f12 = fish2[i]; if (f12 < 0) { f12 *= -1.0; } f12 = floor(f11);
184                     f21 = total1 - fish[i]; if (f21 < 0) { f21 *= -1.0; } f21 = floor(f21);
185                     f22 = total2 - fish2[i]; if (f22 < 0) { f22 *= -1.0; } f22 = floor(f22);
186                     if (fillProps) {   f11 = floor(f11);  f12 = floor(f12); f21 = floor(f21); f22 = floor(f22); }
187 
188                     MothurFisher fisher;
189                     if (m->getDebug()) {    m->mothurOut("[DEBUG]: about to run fisher for Otu " + currentLabels[i] + " F11, F12, F21, F22 = " + toString(f11) + " " + toString(f12) + " " + toString(f21) + " " + toString(f22) + " " + "\n"); }
190 
191                     double pre = fisher.fexact(f11, f12, f21, f22, currentLabels[i]);
192                     if (m->getDebug()) {  m->mothurOut("[DEBUG]: about to completed fisher for Otu " + currentLabels[i] + " pre = " + toString(pre) + "\n"); }
193 
194                     if (pre > 0.999999999)	{ pre = 1.0; }
195 
196                     if (m->getControl_pressed()) { return 1; }
197 
198                     pvalues[i] = pre;
199                 }
200 			}
201 
202             //#*************************************
203             //#  convert stderr^2 to std error
204             //#*************************************
205             for(int i = 0; i < numOTUs; i++){
206                 C1[i][2] = sqrt(C1[i][2]);
207                 C2[i][2] = sqrt(C2[i][2]);
208             }
209         }
210 
211         // And now we write the files to a text file.
212 		struct tm *local;
213 		time_t t; t = time(NULL);
214 		local = localtime(&t);
215 
216 		ofstream out;
217         Utils util; util.openOutputFile(outputFileName, out);
218 		out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
219 
220 		out << "Local time and date of test: " << asctime(local) << endl;
221 		out << "# numOTUss = " << numOTUs << ", # col = " << numSamples << ", g = " << secondGroupingStart << endl << endl;
222 		out << numPermutations << " permutations" << endl << endl;
223 
224 		//output numSamples headings - not really sure... documentation labels 9 numSampless, there are 10 in the output file
225 		//storage 0 = meanGroup1 - line 529, 1 = varGroup1 - line 532, 2 = err rate1 - line 534, 3 = mean of counts group1?? - line 291, 4 = meanGroup2 - line 536, 5 = varGroup2 - line 539, 6 = err rate2 - line 541, 7 = mean of counts group2?? - line 292, 8 = pvalues - line 293
226 		out << "OTU\tmean(group1)\tvariance(group1)\tstderr(group1)\tmean(group2)\tvariance(group2)\tstderr(group2)\tp-value\n";
227 		for(int i = 0; i < numOTUs; i++){
228 			if (m->getControl_pressed()) { out.close(); return 0; }
229 
230             //if there are binlabels use them otherwise count.
231 			if (i < currentLabels.size()) { out << currentLabels[i] << '\t'; }
232             else { out << (i+1) << '\t'; }
233 
234             out << C1[i][0] << '\t' << C1[i][1] << '\t' << C1[i][2] << '\t' << C2[i][0] << '\t' << C2[i][1] << '\t' << C2[i][2] << '\t' << pvalues[i] <<  endl;
235 
236             //if (pvalues[i] < 0.05) { cout << currentLabels[i] << endl; }
237 		}
238 
239 		out << endl << endl; out.close();
240 
241         return 0;
242 
243     }catch(exception& e) {
244         m->errorOut(e, "MothurMetastats", "runMetastats");
245         exit(1);
246     }
247 }
248 /***********************************************************/
permuted_pvalues(vector<vector<double>> & Imatrix,vector<double> & tstats,vector<vector<double>> & Fmatrix)249 vector<double> MothurMetastats::permuted_pvalues(vector< vector<double> >& Imatrix, vector<double>& tstats, vector< vector<double> >& Fmatrix) {
250 	try {
251         //# matrix stores tstats for each taxa(numOTUs) for each permuted trial(numSamples)
252         vector<double> ps;  ps.resize(numOTUs, 0.0); //# to store the pvalues
253         vector< vector<double> > permuted_ttests; permuted_ttests.resize(numPermutations);
254         for (int i = 0; i < numPermutations; i++) { permuted_ttests[i].resize(numOTUs, 0.0);  }
255 
256         //# calculate null version of tstats using B permutations.
257         for (int i = 0; i < numPermutations; i++) {
258             permuted_ttests[i] = permute_and_calc_ts(Imatrix);
259         }
260 
261         //# calculate each pvalue using the null ts
262         if ((secondGroupingStart) < 8 || (numSamples-secondGroupingStart) < 8){
263             vector< vector<double> > cleanedpermuted_ttests; cleanedpermuted_ttests.resize(numPermutations);  //# the array pooling just the frequently observed ts
264             //# then pool the t's together!
265             //# count how many high freq taxa there are
266             int hfc = 1;
267             for (int i = 0; i < numOTUs; i++) {                 // # for each taxa
268                 double group1Total = 0.0; double group2Total = 0.0;
269                 for(int j = 0; j < secondGroupingStart; j++)		{ group1Total += Fmatrix[i][j];	}
270 				for(int j = secondGroupingStart; j < numSamples; j++)	{ group2Total += Fmatrix[i][j];	}
271 
272                 if (group1Total >= secondGroupingStart || group2Total >= (numSamples-secondGroupingStart)){
273                     hfc++;
274                     for (int j = 0; j < numPermutations; j++) {   cleanedpermuted_ttests[j].push_back(permuted_ttests[j][i]); }
275                 }
276             }
277 
278             //#now for each taxa
279             for (int i = 0; i < numOTUs; i++) {
280                 //number of cleanedpermuted_ttests greater than tstat[i]
281                 int numGreater = 0;
282                 for (int j = 0; j < numPermutations; j++) {
283                     for (int k = 0; k < cleanedpermuted_ttests[j].size(); k++) {
284                         if (cleanedpermuted_ttests[j][k] > abs(tstats[i])) { numGreater++; }
285                     }
286                 }
287 
288                 ps[i] = (1/(double)(numPermutations*hfc))*numGreater;
289             }
290         }else{
291             for (int i = 0; i < numOTUs; i++) {
292                 //number of permuted_ttests[i] greater than tstat[i] //(sum(permuted_ttests[i,] > abs(tstats[i]))+1)
293                 int numGreater = 1;
294                 for (int j = 0; j < numPermutations; j++) { if (permuted_ttests[j][i] > abs(tstats[i])) { numGreater++; }   }
295                 ps[i] = (1/(double)(numPermutations+1))*numGreater;
296             }
297         }
298 
299         return ps;
300 
301     }catch(exception& e) {
302         m->errorOut(e, "MothurMetastats", "permuted_pvalues");
303         exit(1);
304     }
305 }
306 /***********************************************************/
permute_and_calc_ts(vector<vector<double>> & Imatrix)307 vector<double> MothurMetastats::permute_and_calc_ts(vector< vector<double> >& Imatrix) {
308 	try {
309         vector< vector<double> > permutedMatrix = Imatrix;
310 
311         //randomize numSampless, ie group abundances.
312         map<int, int> randomMap;
313         vector<int> randoms;
314         for (int i = 0; i < numSamples; i++) { randoms.push_back(i); }
315         util.mothurRandomShuffle(randoms);
316         for (int i = 0; i < randoms.size(); i++) {   randomMap[i] = randoms[i];   }
317 
318         //calc ts
319         vector< vector<double> > C1; C1.resize(numOTUs);
320         for (int i = 0; i < numOTUs; i++) { C1[i].resize(3, 0.0);  } // statistic profiles for class1 and class 2
321         vector< vector<double> > C2; C2.resize(numOTUs);            // mean[1], variance[2], standard error[3]
322         for (int i = 0; i < numOTUs; i++) { C2[i].resize(3, 0.0);  }
323         vector<double> Ts; Ts.resize(numOTUs, 0.0); // a place to store the true t-statistics
324 
325         //#*************************************
326         //#  generate statistics mean, var, stderr
327         //#*************************************
328         for(int i = 0; i < numOTUs; i++){ // for each taxa
329             //# find the mean of each group
330             double g1Total = 0.0; double g2Total = 0.0;
331             for (int j = 0; j < secondGroupingStart; j++)       {     g1Total += permutedMatrix[i][randomMap[j]]; }
332             C1[i][0] = g1Total/(double)(secondGroupingStart);
333             for (int j = secondGroupingStart; j < numSamples; j++)  {     g2Total += permutedMatrix[i][randomMap[j]]; }
334             C2[i][0] = g2Total/(double)(numSamples-secondGroupingStart);
335 
336             //# find the variance of each group
337             double g1Var = 0.0; double g2Var = 0.0;
338             for (int j = 0; j < secondGroupingStart; j++)       {     g1Var += pow((permutedMatrix[i][randomMap[j]]-C1[i][0]), 2);  }
339             C1[i][1] = g1Var/(double)(secondGroupingStart-1);
340             for (int j = secondGroupingStart; j < numSamples; j++)  {     g2Var += pow((permutedMatrix[i][randomMap[j]]-C2[i][0]), 2);  }
341             C2[i][1] = g2Var/(double)(numSamples-secondGroupingStart-1);
342 
343             //# find the std error of each group -std err^2 (will change to std err at end)
344             C1[i][2] = C1[i][1]/(double)(secondGroupingStart);
345             C2[i][2] = C2[i][1]/(double)(numSamples-secondGroupingStart);
346         }
347 
348 
349 
350         //#*************************************
351         //#  two sample t-statistics
352         //#*************************************
353         for(int i = 0; i < numOTUs; i++){                  // # for each taxa
354             double xbar_diff = C1[i][0] - C2[i][0];
355             double denom = sqrt(C1[i][2] + C2[i][2]);
356             Ts[i] = abs(xbar_diff/denom);  // calculate two sample t-statistic
357         }
358 
359         return Ts;
360 
361 
362     }catch(exception& e) {
363         m->errorOut(e, "MothurMetastats", "permuted_ttests");
364         exit(1);
365     }
366 }
367 /***********************************************************/
OrderPValues(int low,int high,vector<double> & p,vector<int> & order)368 int MothurMetastats::OrderPValues(int low, int high, vector<double>& p, vector<int>& order) {
369 	try {
370 
371 		if (low < high) {
372 			int i = low+1;
373 			int j = high;
374 			int pivot = (low+high) / 2;
375 
376 			swapElements(low, pivot, p, order);  //puts pivot in final spot
377 
378 			/* compare value */
379 			double key = p[low];
380 
381 			/* partition */
382 			while(i <= j) {
383 				/* find member above ... */
384 				while((i <= high) && (p[i] <= key))	{  i++;  }
385 
386 				/* find element below ... */
387 				while((j >= low) && (p[j] > key))	{  j--;  }
388 
389 				if(i < j) {
390 					swapElements(i, j, p, order);
391 				}
392 			}
393 
394 			swapElements(low, j, p, order);
395 
396 			/* recurse */
397 			OrderPValues(low, j-1, p, order);
398 			OrderPValues(j+1, high, p, order);
399 		}
400 
401 		return 0;
402 
403 	}catch(exception& e) {
404 		m->errorOut(e, "MothurMetastats", "OrderPValues");
405 		exit(1);
406 	}
407 }
408 /***********************************************************/
swapElements(int i,int j,vector<double> & p,vector<int> & order)409 int MothurMetastats::swapElements(int i, int j, vector<double>& p, vector<int>& order) {
410 	try {
411 
412 		double z = p[i];
413 		p[i] = p[j];
414 		p[j] = z;
415 
416 		int temp = order[i];
417 		order[i] = order[j];
418 		order[j] = temp;
419 
420 		return 0;
421 
422 	}catch(exception& e) {
423 		m->errorOut(e, "MothurMetastats", "swapElements");
424 		exit(1);
425 	}
426 }
427 
428 /***********************************************************/
getSequence(int start,int end,int length)429 vector<int> MothurMetastats::getSequence(int start, int end, int length) {
430 	try {
431 		vector<int> sequence;
432 		double increment = (end-start) / (double) (length-1);
433 
434 		sequence.push_back(start);
435 		for (int i = 1; i < length-1; i++) {
436 			sequence.push_back(int(i*increment));
437 		}
438 		sequence.push_back(end);
439 
440 		return sequence;
441 
442 	}catch(exception& e) {
443 		m->errorOut(e, "MothurMetastats", "getSequence");
444 		exit(1);
445 	}
446 }
447 /***********************************************************/
448 
449 
450 
451 
452 
453