1 /*
2    This file is part of the BOLT-LMM linear mixed model software package
3    developed by Po-Ru Loh.  Copyright (C) 2014-2019 Harvard University.
4 
5    This program is free software: you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published by
7    the Free Software Foundation, either version 3 of the License, or
8    (at your option) any later version.
9 
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU General Public License for more details.
14 
15    You should have received a copy of the GNU General Public License
16    along with this program.  If not, see <http://www.gnu.org/licenses/>.
17 */
18 
19 #include <cstdio>
20 #include <cmath>
21 #include <iostream>
22 #include <vector>
23 #include <utility>
24 #include <algorithm>
25 
26 #include "SnpInfo.hpp"
27 #include "Jackknife.hpp"
28 #include "NumericUtils.hpp"
29 #include "LapackConst.hpp"
30 #include "Types.hpp"
31 #include "MemoryUtils.hpp"
32 #include "LDscoreCalibration.hpp"
33 
34 namespace LDscoreCalibration {
35 
36   using std::vector;
37   using std::pair;
38   using std::make_pair;
39   using std::cout;
40   using std::cerr;
41   using std::endl;
42 
computeMean(const vector<double> & stats,const vector<bool> & maskSnps)43   double computeMean(const vector <double> &stats, const vector <bool> &maskSnps) {
44     double sum = 0; int ctr = 0;
45     for (uint64 m = 0; m < stats.size(); m++)
46       if (maskSnps[m]) {
47 	sum += stats[m];
48 	ctr++;
49       }
50     return sum / ctr;
51   }
52 
computeLambdaGC(const vector<double> & stats,const vector<bool> & maskSnps)53   double computeLambdaGC(const vector <double> &stats, const vector <bool> &maskSnps) {
54     vector <double> goodStats;
55     for (uint64 m = 0; m < stats.size(); m++)
56       if (maskSnps[m])
57 	goodStats.push_back(stats[m]);
58     sort(goodStats.begin(), goodStats.end());
59     return goodStats[goodStats.size()/2] / 0.455;
60   }
61 
removeOutlierWindows(const vector<LMM::SnpInfo> & snps,const vector<double> & stats,vector<bool> maskSnps,double outlierChisqThresh,bool useGenDistWindow)62   vector <bool> removeOutlierWindows
63   (const vector <LMM::SnpInfo> &snps, const vector <double> &stats, vector <bool> maskSnps,
64    double outlierChisqThresh, bool useGenDistWindow) {
65 
66     int M = snps.size();
67     int numGoodSnpsPreOutlier = 0;
68     for (int m = 0; m < M; m++)
69       numGoodSnpsPreOutlier += maskSnps[m];
70     cout << "# of SNPs passing filters before outlier removal: "
71 	 << numGoodSnpsPreOutlier << "/" << M << endl;
72     int numGoodSnps = numGoodSnpsPreOutlier;
73     printf("Masking windows around outlier snps (chisq > %.1f)", outlierChisqThresh);
74     cout << endl;
75 
76     vector < pair <double, int> > statsSortInd(M);
77     for (int m = 0; m < M; m++)
78       statsSortInd[m] = make_pair(stats[m], m);
79     sort(statsSortInd.begin(), statsSortInd.end());
80 
81     for (int s = M-1; s >= 0; s--) { // filter windows around outliers
82       if (statsSortInd[s].first > outlierChisqThresh) {
83 	int m = statsSortInd[s].second;
84 	for (int m2 = m+1; m2 < M; m2++) {
85 	  if (useGenDistWindow) { if (!snps[m].isProximal(snps[m2], OUTLIER_GEN_WINDOW)) break; }
86 	  else { if (!snps[m].isProximal(snps[m2], OUTLIER_PHYS_WINDOW)) break; }
87 	  if (maskSnps[m2]) { maskSnps[m2] = false; numGoodSnps--; }
88 	}
89 	for (int m2 = m-1; m2 >= 0; m2--) {
90 	  if (useGenDistWindow) { if (!snps[m].isProximal(snps[m2], OUTLIER_GEN_WINDOW)) break; }
91 	  else { if (!snps[m].isProximal(snps[m2], OUTLIER_PHYS_WINDOW)) break; }
92 	  if (maskSnps[m2]) { maskSnps[m2] = false; numGoodSnps--; }
93 	}
94 	if (numGoodSnps < numGoodSnpsPreOutlier / 2) {
95 	  cerr << "WARNING: Half of SNPs gone after removing windows near chisq > "
96 	       << statsSortInd[s].first << endl;
97 	  cerr << "         Stopping outlier removal" << endl;
98 	}
99       }
100       else
101 	break;
102     }
103     cout << "# of SNPs remaining after outlier window removal: "
104 	 << numGoodSnps << "/" << numGoodSnpsPreOutlier << endl;
105 
106     return maskSnps;
107   }
108 
computeIntercept(const vector<double> & stats,const vector<double> & LDscores,const vector<double> & LDscoresChip,const vector<bool> & maskSnps,int varianceDegree,double * attenNull1Ptr)109   double computeIntercept
110   (const vector <double> &stats, const vector <double> &LDscores,
111    const vector <double> &LDscoresChip, const vector <bool> &maskSnps,
112    int varianceDegree, double *attenNull1Ptr) {
113 
114     double meanStat = computeMean(stats, maskSnps);
115     double slopeToCM = (meanStat - 1) / computeMean(LDscores, maskSnps);
116 
117     int M = stats.size(), Mfit = 0;
118     for (int m = 0; m < M; m++) Mfit += maskSnps[m];
119     double *weightedStats = ALIGNED_MALLOC_DOUBLES(Mfit);
120     double *weightedRegCols = ALIGNED_MALLOC_DOUBLES(2*Mfit); // sqrt(w) * [all-1s, LDscores]
121 
122     for (int m = 0, mfit = 0; m < M; m++)
123       if (maskSnps[m]) {
124 	// compute weight
125 	double weightDblCount = 1 / std::max(1.0, LDscoresChip[m]);
126 	double weightHeteroskedasticity =
127 	  varianceDegree == 2 ? NumericUtils::sq(1 / std::max(1.0, 1+slopeToCM*LDscores[m])) :
128 	  1 / std::max(2.0, 2 + 0.01*LDscores[m]); // Brendan's approximation
129 
130 	double sqrtWeight = sqrt(weightDblCount * weightHeteroskedasticity);
131 	weightedStats[mfit] = stats[m] * sqrtWeight;           // LHS: stats
132 	weightedRegCols[mfit] = sqrtWeight;                    // RHS: all-1s column
133 	weightedRegCols[Mfit+mfit] = LDscores[m] * sqrtWeight; // RHS: LDscores column
134 	mfit++;
135       }
136 
137     // compute least squares fit
138     double intercept;
139     {
140       char _TRANS = 'N';
141       int _M = Mfit;
142       int _N = 2; // intercept, slope
143       int _NRHS = 1; // fitting one regression (one column of B)
144       double *_A = &weightedRegCols[0];
145       int _LDA = Mfit;
146       double *_B = &weightedStats[0];
147       int _LDB = Mfit;
148       int _LWORK = 4*Mfit; // may not be optimal, but the fitting should be really fast
149       double *_WORK = ALIGNED_MALLOC_DOUBLES(_LWORK);
150       int _INFO;
151       DGELS_MACRO(&_TRANS, &_M, &_N, &_NRHS, _A, &_LDA, _B, &_LDB, _WORK, &_LWORK, &_INFO);
152       intercept = _B[0]; //slope = _B[1];
153       ALIGNED_FREE(_WORK);
154     }
155     ALIGNED_FREE(weightedRegCols);
156     ALIGNED_FREE(weightedStats);
157     if (attenNull1Ptr != NULL) *attenNull1Ptr = (intercept - 1) / (meanStat - 1);
158     return intercept;
159   }
160 
computeIntercepts(const vector<double> & stats,const vector<double> & LDscores,const vector<double> & LDscoresChip,const vector<bool> & maskSnps,int varianceDegree,int jackBlocks,double * attenNull1Ptr,double * attenNull1StdPtr)161   vector <double> computeIntercepts
162   (const vector <double> &stats, const vector <double> &LDscores,
163    const vector <double> &LDscoresChip, const vector <bool> &maskSnps,
164    int varianceDegree, int jackBlocks, double *attenNull1Ptr, double *attenNull1StdPtr) {
165 
166     uint64 M = stats.size();
167     vector <uint64> mfit_to_m;
168     for (uint64 m = 0; m < M; m++)
169       if (maskSnps[m])
170 	mfit_to_m.push_back(m);
171     uint64 Mfit = mfit_to_m.size();
172 
173     vector <double> interceptJacks(jackBlocks+1), attenNull1Jacks(jackBlocks+1);
174     for (uint64 j = 0; j <= (uint64) jackBlocks; j++) {
175       vector <bool> jackMaskSnps = maskSnps;
176       for (uint64 mfit = 0; mfit < Mfit; mfit++) {
177 	if (jackBlocks && mfit*jackBlocks / Mfit == j)
178 	  jackMaskSnps[mfit_to_m[mfit]] = false;
179       }
180       interceptJacks[j] = computeIntercept(stats, LDscores, LDscoresChip, jackMaskSnps,
181 					   varianceDegree, &attenNull1Jacks[j]);
182     }
183     if (attenNull1Ptr != NULL) *attenNull1Ptr = attenNull1Jacks[jackBlocks];
184     if (jackBlocks && attenNull1StdPtr != NULL)
185       *attenNull1StdPtr = Jackknife::stddev(attenNull1Jacks, jackBlocks);
186 
187     return interceptJacks;
188   }
189 
calibrateStatPair(const vector<LMM::SnpInfo> & snps,const vector<double> & statsRef,const vector<double> & statsCur,const vector<double> & LDscores,const vector<double> & LDscoresChip,double minMAF,int N,double outlierVarFracThresh,bool useGenDistWindow,int varianceDegree)190   pair <double, double> calibrateStatPair
191   (const vector <LMM::SnpInfo> &snps, const vector <double> &statsRef,
192    const vector <double> &statsCur, const vector <double> &LDscores,
193    const vector <double> &LDscoresChip, double minMAF, int N, double outlierVarFracThresh,
194    bool useGenDistWindow, int varianceDegree) {
195 
196     int M = snps.size();
197     vector <bool> maskSnps(M);
198     cout << "Filtering to SNPs with chisq stats, LD Scores, and MAF > " << minMAF << endl;
199     for (int m = 0; m < M; m++) // perform initial filtering of snps to use in regression
200       maskSnps[m] =
201 	snps[m].MAF >= minMAF &&      // MAF threshold
202 	statsRef[m] > 0 &&            // ref stat available
203 	statsCur[m] > 0 &&            // cur stat available
204 	!std::isnan(LDscores[m]) &&   // LD Score available
205 	!std::isnan(LDscoresChip[m]); // LD Score weight available
206 
207     // perform outlier removal
208     double outlierChisqThresh = std::max(MIN_OUTLIER_CHISQ_THRESH, N * outlierVarFracThresh);
209     vector <bool> noOutlierMaskSnps = removeOutlierWindows(snps, statsRef, maskSnps,
210 							   outlierChisqThresh, useGenDistWindow);
211 
212     // perform regressions
213     int jackBlocks = SNP_JACKKNIFE_BLOCKS;
214     double attenNull1, attenNull1Std;
215     vector <double> interceptRefJacks
216       = computeIntercepts(statsRef, LDscores, LDscoresChip, noOutlierMaskSnps, varianceDegree,
217 			  jackBlocks, &attenNull1, &attenNull1Std);
218     vector <double> interceptCurJacks
219       = computeIntercepts(statsCur, LDscores, LDscoresChip, noOutlierMaskSnps, varianceDegree,
220 			  jackBlocks, NULL, NULL);
221 #ifdef VERBOSE
222     printf("Intercept of LD Score regression for ref stats:   %.3f (%.3f)\n",
223 	   interceptRefJacks[jackBlocks], Jackknife::stddev(interceptRefJacks, jackBlocks));
224     printf("Estimated attenuation: %.3f (%.3f)\n", attenNull1, attenNull1Std);
225     printf("Intercept of LD Score regression for cur stats: %.3f (%.3f)\n",
226 	   interceptCurJacks[jackBlocks], Jackknife::stddev(interceptCurJacks, jackBlocks));
227 #endif
228 
229     vector <double> calibrationFactorJacks(jackBlocks+1);
230     for (int j = 0; j <= jackBlocks; j++)
231       calibrationFactorJacks[j] = interceptRefJacks[j] / interceptCurJacks[j];
232     pair <double, double> calibrationFactorMeanStd = Jackknife::mean_std(calibrationFactorJacks);
233     printf("Calibration factor (ref/cur) to multiply by:      %.3f (%.3f)\n",
234 	   calibrationFactorMeanStd.first, calibrationFactorMeanStd.second);
235     fflush(stdout);
236 
237     return calibrationFactorMeanStd;
238   }
239 
240   // jackBlocks: 0 for no jackknife
calibrateStat(const vector<LMM::SnpInfo> & snps,const vector<double> & stats,const vector<double> & LDscores,const vector<double> & LDscoresChip,double minMAF,int N,double outlierVarFracThresh,bool useGenDistWindow,int varianceDegree,int jackBlocks)241   RegressionInfo calibrateStat
242   (const vector <LMM::SnpInfo> &snps, const vector <double> &stats,
243    const vector <double> &LDscores, const vector <double> &LDscoresChip, double minMAF, int N,
244    double outlierVarFracThresh, bool useGenDistWindow, int varianceDegree, int jackBlocks) {
245 
246     RegressionInfo info;
247 
248     int M = snps.size();
249     vector <bool> maskSnps(M);
250     for (int m = 0; m < M; m++) maskSnps[m] = stats[m] > 0;
251     info.mean = computeMean(stats, maskSnps); // compute mean of all good stats with no filtering
252     info.lambdaGC = computeLambdaGC(stats, maskSnps);
253 
254     cout << "Filtering to SNPs with chisq stats, LD Scores, and MAF > " << minMAF << endl;
255     for (int m = 0; m < M; m++) // perform initial filtering of snps to use in regression
256       maskSnps[m] =
257 	snps[m].MAF >= minMAF &&      // MAF threshold
258 	stats[m] > 0 &&               // stat available
259 	!std::isnan(LDscores[m]) &&   // LD Score available
260 	!std::isnan(LDscoresChip[m]); // LD Score weight available
261 
262     // perform outlier removal
263     double outlierChisqThresh = std::max(MIN_OUTLIER_CHISQ_THRESH, N * outlierVarFracThresh);
264     vector <bool> noOutlierMaskSnps = removeOutlierWindows(snps, stats, maskSnps,
265 							   outlierChisqThresh, useGenDistWindow);
266     info.noOutlierMean = computeMean(stats, noOutlierMaskSnps);
267 
268     // perform regression
269     vector <double> interceptJacks
270       = computeIntercepts(stats, LDscores, LDscoresChip, noOutlierMaskSnps, varianceDegree,
271 			  jackBlocks, &info.attenNull1, &info.attenNull1Std);
272     info.intercept = interceptJacks[jackBlocks];
273     if (jackBlocks) info.interceptStd = Jackknife::stddev(interceptJacks, jackBlocks);
274 
275     return info;
276   }
277 
278 }
279