1 /* 2 This file is part of the BOLT-LMM linear mixed model software package 3 developed by Po-Ru Loh. Copyright (C) 2014-2019 Harvard University. 4 5 This program is free software: you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation, either version 3 of the License, or 8 (at your option) any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with this program. If not, see <http://www.gnu.org/licenses/>. 17 */ 18 19 #include <cstdio> 20 #include <cmath> 21 #include <iostream> 22 #include <vector> 23 #include <utility> 24 #include <algorithm> 25 26 #include "SnpInfo.hpp" 27 #include "Jackknife.hpp" 28 #include "NumericUtils.hpp" 29 #include "LapackConst.hpp" 30 #include "Types.hpp" 31 #include "MemoryUtils.hpp" 32 #include "LDscoreCalibration.hpp" 33 34 namespace LDscoreCalibration { 35 36 using std::vector; 37 using std::pair; 38 using std::make_pair; 39 using std::cout; 40 using std::cerr; 41 using std::endl; 42 computeMean(const vector<double> & stats,const vector<bool> & maskSnps)43 double computeMean(const vector <double> &stats, const vector <bool> &maskSnps) { 44 double sum = 0; int ctr = 0; 45 for (uint64 m = 0; m < stats.size(); m++) 46 if (maskSnps[m]) { 47 sum += stats[m]; 48 ctr++; 49 } 50 return sum / ctr; 51 } 52 computeLambdaGC(const vector<double> & stats,const vector<bool> & maskSnps)53 double computeLambdaGC(const vector <double> &stats, const vector <bool> &maskSnps) { 54 vector <double> goodStats; 55 for (uint64 m = 0; m < stats.size(); m++) 56 if (maskSnps[m]) 57 goodStats.push_back(stats[m]); 58 sort(goodStats.begin(), goodStats.end()); 59 return goodStats[goodStats.size()/2] / 0.455; 60 } 61 removeOutlierWindows(const vector<LMM::SnpInfo> & snps,const vector<double> & stats,vector<bool> maskSnps,double outlierChisqThresh,bool useGenDistWindow)62 vector <bool> removeOutlierWindows 63 (const vector <LMM::SnpInfo> &snps, const vector <double> &stats, vector <bool> maskSnps, 64 double outlierChisqThresh, bool useGenDistWindow) { 65 66 int M = snps.size(); 67 int numGoodSnpsPreOutlier = 0; 68 for (int m = 0; m < M; m++) 69 numGoodSnpsPreOutlier += maskSnps[m]; 70 cout << "# of SNPs passing filters before outlier removal: " 71 << numGoodSnpsPreOutlier << "/" << M << endl; 72 int numGoodSnps = numGoodSnpsPreOutlier; 73 printf("Masking windows around outlier snps (chisq > %.1f)", outlierChisqThresh); 74 cout << endl; 75 76 vector < pair <double, int> > statsSortInd(M); 77 for (int m = 0; m < M; m++) 78 statsSortInd[m] = make_pair(stats[m], m); 79 sort(statsSortInd.begin(), statsSortInd.end()); 80 81 for (int s = M-1; s >= 0; s--) { // filter windows around outliers 82 if (statsSortInd[s].first > outlierChisqThresh) { 83 int m = statsSortInd[s].second; 84 for (int m2 = m+1; m2 < M; m2++) { 85 if (useGenDistWindow) { if (!snps[m].isProximal(snps[m2], OUTLIER_GEN_WINDOW)) break; } 86 else { if (!snps[m].isProximal(snps[m2], OUTLIER_PHYS_WINDOW)) break; } 87 if (maskSnps[m2]) { maskSnps[m2] = false; numGoodSnps--; } 88 } 89 for (int m2 = m-1; m2 >= 0; m2--) { 90 if (useGenDistWindow) { if (!snps[m].isProximal(snps[m2], OUTLIER_GEN_WINDOW)) break; } 91 else { if (!snps[m].isProximal(snps[m2], OUTLIER_PHYS_WINDOW)) break; } 92 if (maskSnps[m2]) { maskSnps[m2] = false; numGoodSnps--; } 93 } 94 if (numGoodSnps < numGoodSnpsPreOutlier / 2) { 95 cerr << "WARNING: Half of SNPs gone after removing windows near chisq > " 96 << statsSortInd[s].first << endl; 97 cerr << " Stopping outlier removal" << endl; 98 } 99 } 100 else 101 break; 102 } 103 cout << "# of SNPs remaining after outlier window removal: " 104 << numGoodSnps << "/" << numGoodSnpsPreOutlier << endl; 105 106 return maskSnps; 107 } 108 computeIntercept(const vector<double> & stats,const vector<double> & LDscores,const vector<double> & LDscoresChip,const vector<bool> & maskSnps,int varianceDegree,double * attenNull1Ptr)109 double computeIntercept 110 (const vector <double> &stats, const vector <double> &LDscores, 111 const vector <double> &LDscoresChip, const vector <bool> &maskSnps, 112 int varianceDegree, double *attenNull1Ptr) { 113 114 double meanStat = computeMean(stats, maskSnps); 115 double slopeToCM = (meanStat - 1) / computeMean(LDscores, maskSnps); 116 117 int M = stats.size(), Mfit = 0; 118 for (int m = 0; m < M; m++) Mfit += maskSnps[m]; 119 double *weightedStats = ALIGNED_MALLOC_DOUBLES(Mfit); 120 double *weightedRegCols = ALIGNED_MALLOC_DOUBLES(2*Mfit); // sqrt(w) * [all-1s, LDscores] 121 122 for (int m = 0, mfit = 0; m < M; m++) 123 if (maskSnps[m]) { 124 // compute weight 125 double weightDblCount = 1 / std::max(1.0, LDscoresChip[m]); 126 double weightHeteroskedasticity = 127 varianceDegree == 2 ? NumericUtils::sq(1 / std::max(1.0, 1+slopeToCM*LDscores[m])) : 128 1 / std::max(2.0, 2 + 0.01*LDscores[m]); // Brendan's approximation 129 130 double sqrtWeight = sqrt(weightDblCount * weightHeteroskedasticity); 131 weightedStats[mfit] = stats[m] * sqrtWeight; // LHS: stats 132 weightedRegCols[mfit] = sqrtWeight; // RHS: all-1s column 133 weightedRegCols[Mfit+mfit] = LDscores[m] * sqrtWeight; // RHS: LDscores column 134 mfit++; 135 } 136 137 // compute least squares fit 138 double intercept; 139 { 140 char _TRANS = 'N'; 141 int _M = Mfit; 142 int _N = 2; // intercept, slope 143 int _NRHS = 1; // fitting one regression (one column of B) 144 double *_A = &weightedRegCols[0]; 145 int _LDA = Mfit; 146 double *_B = &weightedStats[0]; 147 int _LDB = Mfit; 148 int _LWORK = 4*Mfit; // may not be optimal, but the fitting should be really fast 149 double *_WORK = ALIGNED_MALLOC_DOUBLES(_LWORK); 150 int _INFO; 151 DGELS_MACRO(&_TRANS, &_M, &_N, &_NRHS, _A, &_LDA, _B, &_LDB, _WORK, &_LWORK, &_INFO); 152 intercept = _B[0]; //slope = _B[1]; 153 ALIGNED_FREE(_WORK); 154 } 155 ALIGNED_FREE(weightedRegCols); 156 ALIGNED_FREE(weightedStats); 157 if (attenNull1Ptr != NULL) *attenNull1Ptr = (intercept - 1) / (meanStat - 1); 158 return intercept; 159 } 160 computeIntercepts(const vector<double> & stats,const vector<double> & LDscores,const vector<double> & LDscoresChip,const vector<bool> & maskSnps,int varianceDegree,int jackBlocks,double * attenNull1Ptr,double * attenNull1StdPtr)161 vector <double> computeIntercepts 162 (const vector <double> &stats, const vector <double> &LDscores, 163 const vector <double> &LDscoresChip, const vector <bool> &maskSnps, 164 int varianceDegree, int jackBlocks, double *attenNull1Ptr, double *attenNull1StdPtr) { 165 166 uint64 M = stats.size(); 167 vector <uint64> mfit_to_m; 168 for (uint64 m = 0; m < M; m++) 169 if (maskSnps[m]) 170 mfit_to_m.push_back(m); 171 uint64 Mfit = mfit_to_m.size(); 172 173 vector <double> interceptJacks(jackBlocks+1), attenNull1Jacks(jackBlocks+1); 174 for (uint64 j = 0; j <= (uint64) jackBlocks; j++) { 175 vector <bool> jackMaskSnps = maskSnps; 176 for (uint64 mfit = 0; mfit < Mfit; mfit++) { 177 if (jackBlocks && mfit*jackBlocks / Mfit == j) 178 jackMaskSnps[mfit_to_m[mfit]] = false; 179 } 180 interceptJacks[j] = computeIntercept(stats, LDscores, LDscoresChip, jackMaskSnps, 181 varianceDegree, &attenNull1Jacks[j]); 182 } 183 if (attenNull1Ptr != NULL) *attenNull1Ptr = attenNull1Jacks[jackBlocks]; 184 if (jackBlocks && attenNull1StdPtr != NULL) 185 *attenNull1StdPtr = Jackknife::stddev(attenNull1Jacks, jackBlocks); 186 187 return interceptJacks; 188 } 189 calibrateStatPair(const vector<LMM::SnpInfo> & snps,const vector<double> & statsRef,const vector<double> & statsCur,const vector<double> & LDscores,const vector<double> & LDscoresChip,double minMAF,int N,double outlierVarFracThresh,bool useGenDistWindow,int varianceDegree)190 pair <double, double> calibrateStatPair 191 (const vector <LMM::SnpInfo> &snps, const vector <double> &statsRef, 192 const vector <double> &statsCur, const vector <double> &LDscores, 193 const vector <double> &LDscoresChip, double minMAF, int N, double outlierVarFracThresh, 194 bool useGenDistWindow, int varianceDegree) { 195 196 int M = snps.size(); 197 vector <bool> maskSnps(M); 198 cout << "Filtering to SNPs with chisq stats, LD Scores, and MAF > " << minMAF << endl; 199 for (int m = 0; m < M; m++) // perform initial filtering of snps to use in regression 200 maskSnps[m] = 201 snps[m].MAF >= minMAF && // MAF threshold 202 statsRef[m] > 0 && // ref stat available 203 statsCur[m] > 0 && // cur stat available 204 !std::isnan(LDscores[m]) && // LD Score available 205 !std::isnan(LDscoresChip[m]); // LD Score weight available 206 207 // perform outlier removal 208 double outlierChisqThresh = std::max(MIN_OUTLIER_CHISQ_THRESH, N * outlierVarFracThresh); 209 vector <bool> noOutlierMaskSnps = removeOutlierWindows(snps, statsRef, maskSnps, 210 outlierChisqThresh, useGenDistWindow); 211 212 // perform regressions 213 int jackBlocks = SNP_JACKKNIFE_BLOCKS; 214 double attenNull1, attenNull1Std; 215 vector <double> interceptRefJacks 216 = computeIntercepts(statsRef, LDscores, LDscoresChip, noOutlierMaskSnps, varianceDegree, 217 jackBlocks, &attenNull1, &attenNull1Std); 218 vector <double> interceptCurJacks 219 = computeIntercepts(statsCur, LDscores, LDscoresChip, noOutlierMaskSnps, varianceDegree, 220 jackBlocks, NULL, NULL); 221 #ifdef VERBOSE 222 printf("Intercept of LD Score regression for ref stats: %.3f (%.3f)\n", 223 interceptRefJacks[jackBlocks], Jackknife::stddev(interceptRefJacks, jackBlocks)); 224 printf("Estimated attenuation: %.3f (%.3f)\n", attenNull1, attenNull1Std); 225 printf("Intercept of LD Score regression for cur stats: %.3f (%.3f)\n", 226 interceptCurJacks[jackBlocks], Jackknife::stddev(interceptCurJacks, jackBlocks)); 227 #endif 228 229 vector <double> calibrationFactorJacks(jackBlocks+1); 230 for (int j = 0; j <= jackBlocks; j++) 231 calibrationFactorJacks[j] = interceptRefJacks[j] / interceptCurJacks[j]; 232 pair <double, double> calibrationFactorMeanStd = Jackknife::mean_std(calibrationFactorJacks); 233 printf("Calibration factor (ref/cur) to multiply by: %.3f (%.3f)\n", 234 calibrationFactorMeanStd.first, calibrationFactorMeanStd.second); 235 fflush(stdout); 236 237 return calibrationFactorMeanStd; 238 } 239 240 // jackBlocks: 0 for no jackknife calibrateStat(const vector<LMM::SnpInfo> & snps,const vector<double> & stats,const vector<double> & LDscores,const vector<double> & LDscoresChip,double minMAF,int N,double outlierVarFracThresh,bool useGenDistWindow,int varianceDegree,int jackBlocks)241 RegressionInfo calibrateStat 242 (const vector <LMM::SnpInfo> &snps, const vector <double> &stats, 243 const vector <double> &LDscores, const vector <double> &LDscoresChip, double minMAF, int N, 244 double outlierVarFracThresh, bool useGenDistWindow, int varianceDegree, int jackBlocks) { 245 246 RegressionInfo info; 247 248 int M = snps.size(); 249 vector <bool> maskSnps(M); 250 for (int m = 0; m < M; m++) maskSnps[m] = stats[m] > 0; 251 info.mean = computeMean(stats, maskSnps); // compute mean of all good stats with no filtering 252 info.lambdaGC = computeLambdaGC(stats, maskSnps); 253 254 cout << "Filtering to SNPs with chisq stats, LD Scores, and MAF > " << minMAF << endl; 255 for (int m = 0; m < M; m++) // perform initial filtering of snps to use in regression 256 maskSnps[m] = 257 snps[m].MAF >= minMAF && // MAF threshold 258 stats[m] > 0 && // stat available 259 !std::isnan(LDscores[m]) && // LD Score available 260 !std::isnan(LDscoresChip[m]); // LD Score weight available 261 262 // perform outlier removal 263 double outlierChisqThresh = std::max(MIN_OUTLIER_CHISQ_THRESH, N * outlierVarFracThresh); 264 vector <bool> noOutlierMaskSnps = removeOutlierWindows(snps, stats, maskSnps, 265 outlierChisqThresh, useGenDistWindow); 266 info.noOutlierMean = computeMean(stats, noOutlierMaskSnps); 267 268 // perform regression 269 vector <double> interceptJacks 270 = computeIntercepts(stats, LDscores, LDscoresChip, noOutlierMaskSnps, varianceDegree, 271 jackBlocks, &info.attenNull1, &info.attenNull1Std); 272 info.intercept = interceptJacks[jackBlocks]; 273 if (jackBlocks) info.interceptStd = Jackknife::stddev(interceptJacks, jackBlocks); 274 275 return info; 276 } 277 278 } 279