1 /*
2 * mothurmetastats.cpp
3 * Mothur
4 *
5 * Created by westcott on 7/6/11.
6 * Copyright 2011 Schloss Lab. All rights reserved.
7 *
8 */
9
10 #include "mothurmetastats.h"
11 #include "mothurfisher.h"
12 #include "utils.hpp"
13
14 /***********************************************************/
MothurMetastats(double t,int n)15 MothurMetastats::MothurMetastats(double t, int n) {
16 try {
17 m = MothurOut::getInstance();
18 threshold = t;
19 numPermutations = n;
20
21 }catch(exception& e) {
22 m->errorOut(e, "MothurMetastats", "MothurMetastats");
23 exit(1);
24 }
25 }
26 /***********************************************************/
~MothurMetastats()27 MothurMetastats::~MothurMetastats() {}
28 /***********************************************************/
29 //main metastats function
runMetastats(string outputFileName,vector<vector<double>> & data,int secGroupingStart,vector<string> currentLabels,bool fillProps)30 int MothurMetastats::runMetastats(string outputFileName, vector< vector<double> >& data, int secGroupingStart, vector<string> currentLabels, bool fillProps) {
31 try {
32
33 numOTUs = data.size(); //numBins
34 numSamples = data[0].size(); //numGroups in subset
35 secondGroupingStart = secGroupingStart; //g number of samples in group 1
36
37 vector< vector<double> > Pmatrix; Pmatrix.resize(numOTUs);
38 for (int i = 0; i < numOTUs; i++) { Pmatrix[i].resize(numSamples, 0.0); } // the relative proportion matrix
39 vector< vector<double> > C1; C1.resize(numOTUs);
40 for (int i = 0; i < numOTUs; i++) { C1[i].resize(3, 0.0); } // statistic profiles for class1 and class 2
41 vector< vector<double> > C2; C2.resize(numOTUs); // mean[1], variance[2], standard error[3]
42 for (int i = 0; i < numOTUs; i++) { C2[i].resize(3, 0.0); }
43 vector<double> T_statistics; T_statistics.resize(numOTUs, 1); // a place to store the true t-statistics
44 vector<double> pvalues; pvalues.resize(numOTUs, 1); // place to store pvalues
45
46 //*************************************
47 // convert to proportions
48 // generate Pmatrix
49 //*************************************
50 vector<double> totals; totals.resize(numSamples, 0); // sum of numSampless / samples -> numSeqs for each sample
51 //total[i] = total abundance for group[i]
52 for (int i = 0; i < numSamples; i++) { //each sample
53 for (int j = 0; j < numOTUs; j++) { //each otu
54 totals[i] += data[j][i];
55 }
56 }
57
58 for (int i = 0; i < numSamples; i++) { //sample
59 for (int j = 0; j < numOTUs; j++) { //otu
60 if (fillProps) { Pmatrix[j][i] = data[j][i]/totals[i]; }
61 else { Pmatrix[j][i] = data[j][i]; }
62 }
63 }
64
65 //#********************************************************************************
66 //# ************************** STATISTICAL TESTING ********************************
67 //#********************************************************************************
68
69 if (numSamples == 2){ //# then we have a two sample comparison
70 //#************************************************************
71 //# generate p values fisher's exact test
72 //#************************************************************
73 double total1, total2; total1 = 0; total2 = 0;
74 //total for first grouping
75 for (int i = 0; i < secondGroupingStart; i++) { total1 += totals[i]; }
76
77 //total for second grouping
78 for (int i = secondGroupingStart; i < numSamples; i++) { total2 += totals[i]; }
79
80 vector<double> fish; fish.resize(numOTUs, 0.0);
81 vector<double> fish2; fish2.resize(numOTUs, 0.0);
82 //vector<string> currentLabels = m->getCurrentSharedBinLabels();
83 for(int i = 0; i < numOTUs; i++){ //numBins
84
85 for(int j = 0; j < secondGroupingStart; j++) { fish[i] += data[i][j]; }
86 for(int j = secondGroupingStart; j < numSamples; j++) { fish2[i] += data[i][j]; }
87
88 double f11, f12, f21, f22;
89 f11 = fish[i];
90 f12 = fish2[i];
91 f21 = total1 - fish[i];
92 f22 = total2 - fish2[i];
93 if (fillProps) { f11 = floor(f11); f12 = floor(f12); f21 = floor(f21); f22 = floor(f22); }
94
95 MothurFisher fisher;
96 double pre = fisher.fexact(f11, f12, f21, f22, currentLabels[i]);
97 if (pre > 0.999999999) { pre = 1.0; }
98
99 if (m->getControl_pressed()) { return 1; }
100
101 pvalues[i] = pre;
102 }
103
104 }else { //we have multiple subjects per population
105
106 //#*************************************
107 //# generate statistics mean, var, stderr
108 //#*************************************
109 for(int i = 0; i < numOTUs; i++){ // for each taxa
110 //# find the mean of each group
111 double g1Total = 0.0; double g2Total = 0.0;
112 for (int j = 0; j < secondGroupingStart; j++) { g1Total += Pmatrix[i][j]; }
113 C1[i][0] = g1Total/(double)(secondGroupingStart);
114 for (int j = secondGroupingStart; j < numSamples; j++) { g2Total += Pmatrix[i][j]; }
115 C2[i][0] = g2Total/(double)(numSamples-secondGroupingStart);
116
117 //# find the variance of each group
118 double g1Var = 0.0; double g2Var = 0.0;
119 for (int j = 0; j < secondGroupingStart; j++) { g1Var += pow((Pmatrix[i][j]-C1[i][0]), 2); }
120 C1[i][1] = g1Var/(double)(secondGroupingStart-1);
121 for (int j = secondGroupingStart; j < numSamples; j++) { g2Var += pow((Pmatrix[i][j]-C2[i][0]), 2); }
122 C2[i][1] = g2Var/(double)(numSamples-secondGroupingStart-1);
123
124 //# find the std error of each group -std err^2 (will change to std err at end)
125 C1[i][2] = C1[i][1]/(double)(secondGroupingStart);
126 C2[i][2] = C2[i][1]/(double)(numSamples-secondGroupingStart);
127 }
128
129 //#*************************************
130 //# two sample t-statistics
131 //#*************************************
132 for(int i = 0; i < numOTUs; i++){ // # for each taxa
133 double xbar_diff = C1[i][0] - C2[i][0];
134 double denom = sqrt(C1[i][2] + C2[i][2]);
135 T_statistics[i] = xbar_diff/denom; // calculate two sample t-statistic
136 }
137
138 if (m->getDebug()) {
139 for (int i = 0; i < numOTUs; i++) {
140 for (int j = 0; j < 3; j++) {
141 cout << "C1[" << i+1 << "," << j+1 << "]=" << C1[i][j] << ";" << endl;
142 cout << "C2[" << i+1 << "," << j+1 << "]=" << C2[i][j] << ";" << endl;
143 }
144 cout << "T_statistics[" << i+1 << "]=" << T_statistics[i] << ";" << endl;
145 }
146
147 for (int i = 0; i < numOTUs; i++) {
148 for (int j = 0; j < numSamples; j++) {
149 cout << "Fmatrix[" << i+1 << "," << j+1 << "]=" << data[i][j] << ";" << endl;
150 }
151 }
152 }
153 //#*************************************
154 //# generate initial permuted p-values
155 //#*************************************
156 pvalues = permuted_pvalues(Pmatrix, T_statistics, data);
157
158 if (m->getDebug()) { for (int i = 0; i < numOTUs; i++) { m->mothurOut("[DEBUG]: " + currentLabels[i] + " pvalue = " + toString(pvalues[i]) + "\n"); } }
159
160 //#*************************************
161 //# generate p values for sparse data
162 //# using fisher's exact test
163 //#*************************************
164 double total1, total2; total1 = 0; total2 = 0;
165 //total for first grouping
166 for (int i = 0; i < secondGroupingStart; i++) { total1 += totals[i]; } //total all seqs in first set
167
168 //total for second grouping
169 for (int i = secondGroupingStart; i < numSamples; i++) { total2 += totals[i]; } //total all seqs in second set
170
171 vector<double> fish; fish.resize(numOTUs, 0.0);
172 vector<double> fish2; fish2.resize(numOTUs, 0.0);
173
174 for(int i = 0; i < numOTUs; i++){ //numBins
175
176 for(int j = 0; j < secondGroupingStart; j++) { fish[i] += data[i][j]; }
177 for(int j = secondGroupingStart; j < numSamples; j++) { fish2[i] += data[i][j]; }
178
179 if ((fish[i] < secondGroupingStart) && (fish2[i] < (numSamples-secondGroupingStart))) {
180
181 double f11, f12, f21, f22;
182 f11 = fish[i]; if (f11 < 0) { f11 *= -1.0; } f11 = floor(f11);
183 f12 = fish2[i]; if (f12 < 0) { f12 *= -1.0; } f12 = floor(f11);
184 f21 = total1 - fish[i]; if (f21 < 0) { f21 *= -1.0; } f21 = floor(f21);
185 f22 = total2 - fish2[i]; if (f22 < 0) { f22 *= -1.0; } f22 = floor(f22);
186 if (fillProps) { f11 = floor(f11); f12 = floor(f12); f21 = floor(f21); f22 = floor(f22); }
187
188 MothurFisher fisher;
189 if (m->getDebug()) { m->mothurOut("[DEBUG]: about to run fisher for Otu " + currentLabels[i] + " F11, F12, F21, F22 = " + toString(f11) + " " + toString(f12) + " " + toString(f21) + " " + toString(f22) + " " + "\n"); }
190
191 double pre = fisher.fexact(f11, f12, f21, f22, currentLabels[i]);
192 if (m->getDebug()) { m->mothurOut("[DEBUG]: about to completed fisher for Otu " + currentLabels[i] + " pre = " + toString(pre) + "\n"); }
193
194 if (pre > 0.999999999) { pre = 1.0; }
195
196 if (m->getControl_pressed()) { return 1; }
197
198 pvalues[i] = pre;
199 }
200 }
201
202 //#*************************************
203 //# convert stderr^2 to std error
204 //#*************************************
205 for(int i = 0; i < numOTUs; i++){
206 C1[i][2] = sqrt(C1[i][2]);
207 C2[i][2] = sqrt(C2[i][2]);
208 }
209 }
210
211 // And now we write the files to a text file.
212 struct tm *local;
213 time_t t; t = time(NULL);
214 local = localtime(&t);
215
216 ofstream out;
217 Utils util; util.openOutputFile(outputFileName, out);
218 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
219
220 out << "Local time and date of test: " << asctime(local) << endl;
221 out << "# numOTUss = " << numOTUs << ", # col = " << numSamples << ", g = " << secondGroupingStart << endl << endl;
222 out << numPermutations << " permutations" << endl << endl;
223
224 //output numSamples headings - not really sure... documentation labels 9 numSampless, there are 10 in the output file
225 //storage 0 = meanGroup1 - line 529, 1 = varGroup1 - line 532, 2 = err rate1 - line 534, 3 = mean of counts group1?? - line 291, 4 = meanGroup2 - line 536, 5 = varGroup2 - line 539, 6 = err rate2 - line 541, 7 = mean of counts group2?? - line 292, 8 = pvalues - line 293
226 out << "OTU\tmean(group1)\tvariance(group1)\tstderr(group1)\tmean(group2)\tvariance(group2)\tstderr(group2)\tp-value\n";
227 for(int i = 0; i < numOTUs; i++){
228 if (m->getControl_pressed()) { out.close(); return 0; }
229
230 //if there are binlabels use them otherwise count.
231 if (i < currentLabels.size()) { out << currentLabels[i] << '\t'; }
232 else { out << (i+1) << '\t'; }
233
234 out << C1[i][0] << '\t' << C1[i][1] << '\t' << C1[i][2] << '\t' << C2[i][0] << '\t' << C2[i][1] << '\t' << C2[i][2] << '\t' << pvalues[i] << endl;
235
236 //if (pvalues[i] < 0.05) { cout << currentLabels[i] << endl; }
237 }
238
239 out << endl << endl; out.close();
240
241 return 0;
242
243 }catch(exception& e) {
244 m->errorOut(e, "MothurMetastats", "runMetastats");
245 exit(1);
246 }
247 }
248 /***********************************************************/
permuted_pvalues(vector<vector<double>> & Imatrix,vector<double> & tstats,vector<vector<double>> & Fmatrix)249 vector<double> MothurMetastats::permuted_pvalues(vector< vector<double> >& Imatrix, vector<double>& tstats, vector< vector<double> >& Fmatrix) {
250 try {
251 //# matrix stores tstats for each taxa(numOTUs) for each permuted trial(numSamples)
252 vector<double> ps; ps.resize(numOTUs, 0.0); //# to store the pvalues
253 vector< vector<double> > permuted_ttests; permuted_ttests.resize(numPermutations);
254 for (int i = 0; i < numPermutations; i++) { permuted_ttests[i].resize(numOTUs, 0.0); }
255
256 //# calculate null version of tstats using B permutations.
257 for (int i = 0; i < numPermutations; i++) {
258 permuted_ttests[i] = permute_and_calc_ts(Imatrix);
259 }
260
261 //# calculate each pvalue using the null ts
262 if ((secondGroupingStart) < 8 || (numSamples-secondGroupingStart) < 8){
263 vector< vector<double> > cleanedpermuted_ttests; cleanedpermuted_ttests.resize(numPermutations); //# the array pooling just the frequently observed ts
264 //# then pool the t's together!
265 //# count how many high freq taxa there are
266 int hfc = 1;
267 for (int i = 0; i < numOTUs; i++) { // # for each taxa
268 double group1Total = 0.0; double group2Total = 0.0;
269 for(int j = 0; j < secondGroupingStart; j++) { group1Total += Fmatrix[i][j]; }
270 for(int j = secondGroupingStart; j < numSamples; j++) { group2Total += Fmatrix[i][j]; }
271
272 if (group1Total >= secondGroupingStart || group2Total >= (numSamples-secondGroupingStart)){
273 hfc++;
274 for (int j = 0; j < numPermutations; j++) { cleanedpermuted_ttests[j].push_back(permuted_ttests[j][i]); }
275 }
276 }
277
278 //#now for each taxa
279 for (int i = 0; i < numOTUs; i++) {
280 //number of cleanedpermuted_ttests greater than tstat[i]
281 int numGreater = 0;
282 for (int j = 0; j < numPermutations; j++) {
283 for (int k = 0; k < cleanedpermuted_ttests[j].size(); k++) {
284 if (cleanedpermuted_ttests[j][k] > abs(tstats[i])) { numGreater++; }
285 }
286 }
287
288 ps[i] = (1/(double)(numPermutations*hfc))*numGreater;
289 }
290 }else{
291 for (int i = 0; i < numOTUs; i++) {
292 //number of permuted_ttests[i] greater than tstat[i] //(sum(permuted_ttests[i,] > abs(tstats[i]))+1)
293 int numGreater = 1;
294 for (int j = 0; j < numPermutations; j++) { if (permuted_ttests[j][i] > abs(tstats[i])) { numGreater++; } }
295 ps[i] = (1/(double)(numPermutations+1))*numGreater;
296 }
297 }
298
299 return ps;
300
301 }catch(exception& e) {
302 m->errorOut(e, "MothurMetastats", "permuted_pvalues");
303 exit(1);
304 }
305 }
306 /***********************************************************/
permute_and_calc_ts(vector<vector<double>> & Imatrix)307 vector<double> MothurMetastats::permute_and_calc_ts(vector< vector<double> >& Imatrix) {
308 try {
309 vector< vector<double> > permutedMatrix = Imatrix;
310
311 //randomize numSampless, ie group abundances.
312 map<int, int> randomMap;
313 vector<int> randoms;
314 for (int i = 0; i < numSamples; i++) { randoms.push_back(i); }
315 util.mothurRandomShuffle(randoms);
316 for (int i = 0; i < randoms.size(); i++) { randomMap[i] = randoms[i]; }
317
318 //calc ts
319 vector< vector<double> > C1; C1.resize(numOTUs);
320 for (int i = 0; i < numOTUs; i++) { C1[i].resize(3, 0.0); } // statistic profiles for class1 and class 2
321 vector< vector<double> > C2; C2.resize(numOTUs); // mean[1], variance[2], standard error[3]
322 for (int i = 0; i < numOTUs; i++) { C2[i].resize(3, 0.0); }
323 vector<double> Ts; Ts.resize(numOTUs, 0.0); // a place to store the true t-statistics
324
325 //#*************************************
326 //# generate statistics mean, var, stderr
327 //#*************************************
328 for(int i = 0; i < numOTUs; i++){ // for each taxa
329 //# find the mean of each group
330 double g1Total = 0.0; double g2Total = 0.0;
331 for (int j = 0; j < secondGroupingStart; j++) { g1Total += permutedMatrix[i][randomMap[j]]; }
332 C1[i][0] = g1Total/(double)(secondGroupingStart);
333 for (int j = secondGroupingStart; j < numSamples; j++) { g2Total += permutedMatrix[i][randomMap[j]]; }
334 C2[i][0] = g2Total/(double)(numSamples-secondGroupingStart);
335
336 //# find the variance of each group
337 double g1Var = 0.0; double g2Var = 0.0;
338 for (int j = 0; j < secondGroupingStart; j++) { g1Var += pow((permutedMatrix[i][randomMap[j]]-C1[i][0]), 2); }
339 C1[i][1] = g1Var/(double)(secondGroupingStart-1);
340 for (int j = secondGroupingStart; j < numSamples; j++) { g2Var += pow((permutedMatrix[i][randomMap[j]]-C2[i][0]), 2); }
341 C2[i][1] = g2Var/(double)(numSamples-secondGroupingStart-1);
342
343 //# find the std error of each group -std err^2 (will change to std err at end)
344 C1[i][2] = C1[i][1]/(double)(secondGroupingStart);
345 C2[i][2] = C2[i][1]/(double)(numSamples-secondGroupingStart);
346 }
347
348
349
350 //#*************************************
351 //# two sample t-statistics
352 //#*************************************
353 for(int i = 0; i < numOTUs; i++){ // # for each taxa
354 double xbar_diff = C1[i][0] - C2[i][0];
355 double denom = sqrt(C1[i][2] + C2[i][2]);
356 Ts[i] = abs(xbar_diff/denom); // calculate two sample t-statistic
357 }
358
359 return Ts;
360
361
362 }catch(exception& e) {
363 m->errorOut(e, "MothurMetastats", "permuted_ttests");
364 exit(1);
365 }
366 }
367 /***********************************************************/
OrderPValues(int low,int high,vector<double> & p,vector<int> & order)368 int MothurMetastats::OrderPValues(int low, int high, vector<double>& p, vector<int>& order) {
369 try {
370
371 if (low < high) {
372 int i = low+1;
373 int j = high;
374 int pivot = (low+high) / 2;
375
376 swapElements(low, pivot, p, order); //puts pivot in final spot
377
378 /* compare value */
379 double key = p[low];
380
381 /* partition */
382 while(i <= j) {
383 /* find member above ... */
384 while((i <= high) && (p[i] <= key)) { i++; }
385
386 /* find element below ... */
387 while((j >= low) && (p[j] > key)) { j--; }
388
389 if(i < j) {
390 swapElements(i, j, p, order);
391 }
392 }
393
394 swapElements(low, j, p, order);
395
396 /* recurse */
397 OrderPValues(low, j-1, p, order);
398 OrderPValues(j+1, high, p, order);
399 }
400
401 return 0;
402
403 }catch(exception& e) {
404 m->errorOut(e, "MothurMetastats", "OrderPValues");
405 exit(1);
406 }
407 }
408 /***********************************************************/
swapElements(int i,int j,vector<double> & p,vector<int> & order)409 int MothurMetastats::swapElements(int i, int j, vector<double>& p, vector<int>& order) {
410 try {
411
412 double z = p[i];
413 p[i] = p[j];
414 p[j] = z;
415
416 int temp = order[i];
417 order[i] = order[j];
418 order[j] = temp;
419
420 return 0;
421
422 }catch(exception& e) {
423 m->errorOut(e, "MothurMetastats", "swapElements");
424 exit(1);
425 }
426 }
427
428 /***********************************************************/
getSequence(int start,int end,int length)429 vector<int> MothurMetastats::getSequence(int start, int end, int length) {
430 try {
431 vector<int> sequence;
432 double increment = (end-start) / (double) (length-1);
433
434 sequence.push_back(start);
435 for (int i = 1; i < length-1; i++) {
436 sequence.push_back(int(i*increment));
437 }
438 sequence.push_back(end);
439
440 return sequence;
441
442 }catch(exception& e) {
443 m->errorOut(e, "MothurMetastats", "getSequence");
444 exit(1);
445 }
446 }
447 /***********************************************************/
448
449
450
451
452
453