1 /* 2 This file is part of the BOLT-LMM linear mixed model software package 3 developed by Po-Ru Loh. Copyright (C) 2014-2019 Harvard University. 4 5 This program is free software: you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation, either version 3 of the License, or 8 (at your option) any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with this program. If not, see <http://www.gnu.org/licenses/>. 17 */ 18 19 #include <cstdio> 20 #include <cstdlib> 21 #include <cstring> 22 #include <iostream> 23 #include <vector> 24 #include <string> 25 #include <utility> 26 #include <algorithm> 27 28 #include "BoltParEstCV.hpp" 29 #include "MemoryUtils.hpp" 30 #include "NumericUtils.hpp" 31 #include "StatsUtils.hpp" 32 #include "Jackknife.hpp" 33 34 namespace LMM { 35 36 using std::vector; 37 using std::string; 38 using std::cout; 39 using std::cerr; 40 using std::endl; 41 ParamData(double _f2,double _p)42 BoltParEstCV::ParamData::ParamData(double _f2, double _p) : f2(_f2), p(_p) {} 43 operator <(const BoltParEstCV::ParamData & paramData2) const44 bool BoltParEstCV::ParamData::operator < (const BoltParEstCV::ParamData ¶mData2) const { 45 return StatsUtils::zScoreDiff(PVEs, paramData2.PVEs) < -2; 46 } 47 BoltParEstCV(const SnpData & _snpData,const DataMatrix & _covarDataT,const double subMaskIndivs[],const vector<std::pair<std::string,DataMatrix::ValueType>> & _covars,int covarMaxLevels,bool _covarUseMissingIndic,int mBlockMultX,int Nautosomes)48 BoltParEstCV::BoltParEstCV 49 (const SnpData& _snpData, const DataMatrix& _covarDataT, const double subMaskIndivs[], 50 const vector < std::pair <std::string, DataMatrix::ValueType> > &_covars, int covarMaxLevels, 51 bool _covarUseMissingIndic, int mBlockMultX, int Nautosomes) 52 : snpData(_snpData), covarDataT(_covarDataT), 53 bolt(_snpData, _covarDataT, subMaskIndivs, _covars, covarMaxLevels, _covarUseMissingIndic, 54 mBlockMultX, Nautosomes), 55 covars(_covars), covarUseMissingIndic(_covarUseMissingIndic) { 56 } 57 58 /** 59 * (f2, p) parameter estimation via cross-validation 60 * - after each fold, compare PVEs of putative (f2, p) param pairs 61 * - eliminate clearly suboptimal param pairs from future folds 62 * - stop when only one param pair left 63 * 64 * return: iterations used in last CV fold 65 */ estMixtureParams(double * f2Est,double * pEst,double * predBoost,const vector<double> & pheno,double logDeltaEst,double sigma2Kest,int CVfoldsSplit,int CVfoldsCompute,bool CVnoEarlyExit,double predBoostMin,bool MCMC,int maxIters,double approxLLtol,int mBlockMultX,int Nautosomes) const66 int BoltParEstCV::estMixtureParams 67 (double *f2Est, double *pEst, double *predBoost, const vector <double> &pheno, 68 double logDeltaEst, double sigma2Kest, int CVfoldsSplit, int CVfoldsCompute, bool CVnoEarlyExit, 69 double predBoostMin, bool MCMC, int maxIters, double approxLLtol, int mBlockMultX, 70 int Nautosomes) const { 71 72 if (CVfoldsCompute <= 0) { 73 const int Nwant = 10000, Nrep = bolt.getNused() / CVfoldsSplit + 1; 74 CVfoldsCompute = std::min(CVfoldsSplit, (Nwant+Nrep-1) / Nrep); 75 cout << "Max CV folds to compute = " << CVfoldsCompute 76 << " (to have > " << Nwant << " samples)" << endl << endl; 77 } 78 if (CVfoldsCompute > CVfoldsSplit) { 79 cerr << "WARNING: CVfoldsCompute > CVfoldsSplit; setting CVfoldsCompute to " << CVfoldsSplit 80 << endl << endl; 81 CVfoldsCompute = CVfoldsSplit; 82 } 83 84 int usedIters = 0; 85 86 // try a fixed set of (f2, p) mixture param pairs 87 const int NUM_F2S = 3; const double f2s[NUM_F2S] = {0.5, 0.3, 0.1}; 88 const int NUM_PS = 6; const double ps[NUM_PS] = {0.5, 0.2, 0.1, 0.05, 0.02, 0.01}; 89 //const int NUM_PS = 9; const double ps[NUM_PS] = {0.5, 0.2, 0.1, 0.05, 0.02, 0.01, 0.005, 0.002, 0.001}; 90 91 // all (f2, p) pairs are in play at the start; this list will be pruned after each fold 92 // important: first pair corresponds to infinitesimal model 93 vector <ParamData> paramDataAll; 94 for (int f2i = 0; f2i < NUM_F2S; f2i++) 95 for (int pi = 0; pi < NUM_PS; pi++) 96 paramDataAll.push_back(ParamData(f2s[f2i], ps[pi])); 97 98 const double *maskIndivs = bolt.getMaskIndivs(); // possibly a subset of snpData.maskIndivs 99 uint64 Nstride = snpData.getNstride(); 100 uint64 M = snpData.getM(); 101 102 // divide indivs into CVfoldsSplit folds 103 vector <int> foldAssignments(Nstride, -1); // -1 for masked indivs 104 int indCtr = 0; 105 for (uint64 n = 0; n < Nstride; n++) 106 if (maskIndivs[n]) 107 foldAssignments[n] = (indCtr++) % CVfoldsSplit; 108 109 double *foldMaskIndivs = ALIGNED_MALLOC_DOUBLES(Nstride); 110 vector <double> baselineMSEs; 111 112 // run OOS pred for each fold in turn 113 for (int fold = 0; fold < CVfoldsCompute; fold++) { 114 115 cout << "====> Starting CV fold " << (fold+1) << " <====" << endl << endl; 116 117 // set up fold assignment mask 118 for (uint64 n = 0; n < Nstride; n++) 119 foldMaskIndivs[n] = (foldAssignments[n] != fold && foldAssignments[n] != -1); 120 121 // create Bolt instance for predicting using non-left-out indivs 122 int foldCovarMaxLevels = 1<<30; // no need to re-check covar max levels 123 const Bolt boltFold(snpData, covarDataT, foldMaskIndivs, covars, foldCovarMaxLevels, 124 covarUseMissingIndic, mBlockMultX, Nautosomes); 125 126 vector <double> PVEs; double baselinePredMSE; 127 { // set up arguments and call Bayes-iter 128 uint64 B = paramDataAll.size(); // number of remaining (f2, p) pairs in play 129 130 double *phenoResidCovCompVecs = ALIGNED_MALLOC_DOUBLES(B*boltFold.getNCstride()); 131 boltFold.maskFillCovCompVecs(phenoResidCovCompVecs, &pheno[0], B); 132 133 double *betasTrans = ALIGNED_MALLOC_DOUBLES(M*B); 134 135 uchar *batchMaskSnps = ALIGNED_MALLOC_UCHARS(M*B); 136 const uchar *projMaskSnpsFold = boltFold.getProjMaskSnps(); 137 for (uint64 m = 0; m < M; m++) 138 memset(batchMaskSnps + m*B, projMaskSnpsFold[m], B*sizeof(batchMaskSnps[0])); 139 140 uint64 MprojMaskFold = boltFold.getMprojMask(); 141 vector <uint64> Ms(B, MprojMaskFold); 142 143 vector <double> logDeltas(B, logDeltaEst); 144 vector <double> sigma2Ks(B, sigma2Kest); 145 146 vector <double> varFrac2Ests(B), pEsts(B); 147 for (uint64 b = 0; b < B; b++) { 148 varFrac2Ests[b] = paramDataAll[b].f2; 149 pEsts[b] = paramDataAll[b].p; 150 } 151 152 // fit the models, one for each (f2, p) pair 153 usedIters = 154 boltFold.batchComputeBayesIter(phenoResidCovCompVecs, betasTrans, batchMaskSnps, 155 &Ms[0], &logDeltas[0], &sigma2Ks[0], &varFrac2Ests[0], 156 &pEsts[0], B, MCMC, maxIters, approxLLtol); 157 158 // reset fold assignment mask to prediction indivs 159 for (uint64 n = 0; n < Nstride; n++) 160 foldMaskIndivs[n] = (foldAssignments[n] == fold); 161 162 // build predictions and compute PVEs 163 PVEs = boltFold.batchComputePredPVEs(&baselinePredMSE, &pheno[0], betasTrans, B, 164 foldMaskIndivs); 165 166 ALIGNED_FREE(batchMaskSnps); 167 ALIGNED_FREE(betasTrans); 168 ALIGNED_FREE(phenoResidCovCompVecs); 169 } 170 171 baselineMSEs.push_back(baselinePredMSE); 172 for (uint64 b = 0; b < paramDataAll.size(); b++) { 173 paramDataAll[b].PVEs.push_back(PVEs[b]); 174 paramDataAll[b].MSEs.push_back(baselinePredMSE * (1-PVEs[b])); 175 } 176 177 #ifdef VERBOSE 178 cout << endl << "Average PVEs obtained by param pairs tested (high to low):" << endl; 179 vector < std::pair <double, string> > avgPVEs; 180 for (uint64 b = 0; b < paramDataAll.size(); b++) { 181 char buf[100]; sprintf(buf, "f2=%g, p=%g", paramDataAll[b].f2, paramDataAll[b].p); 182 avgPVEs.push_back(std::make_pair(NumericUtils::mean(paramDataAll[b].PVEs), string(buf))); 183 std::sort(avgPVEs.begin(), avgPVEs.end(), std::greater < std::pair <double, string> > ()); 184 } 185 if (avgPVEs.size() <= 5) 186 for (uint64 b = 0; b < avgPVEs.size(); b++) 187 printf("%15s: %f\n", avgPVEs[b].second.c_str(), avgPVEs[b].first); 188 else { 189 for (int b = 0; b < 3; b++) 190 printf("%15s: %f\n", avgPVEs[b].second.c_str(), avgPVEs[b].first); 191 printf("%15s\n", "..."); 192 printf("%15s: %f\n", avgPVEs.back().second.c_str(), avgPVEs.back().first); 193 } 194 cout << endl; 195 #endif 196 197 double bestPVE = *std::max_element(PVEs.begin(), PVEs.end()); 198 uint bestInd = std::max_element(PVEs.begin(), PVEs.end())-PVEs.begin(); 199 char bestPars[100]; sprintf(bestPars, "f2=%g, p=%g", paramDataAll[bestInd].f2, 200 paramDataAll[bestInd].p); 201 202 #ifdef VERBOSE 203 cout << "Detailed CV fold results:" << endl; 204 printf(" Absolute prediction MSE baseline (covariates only): %g\n", baselinePredMSE); 205 if (paramDataAll[0].f2 == 0.5 && paramDataAll[0].p == 0.5) 206 printf(" Absolute prediction MSE using standard LMM: %g\n", 207 paramDataAll[0].MSEs.back()); 208 printf(" Absolute prediction MSE, fold-best %14s: %g\n", bestPars, 209 paramDataAll[bestInd].MSEs.back()); 210 for (uint b = 0; b < paramDataAll.size(); b++) { 211 char buf[100]; sprintf(buf, "f2=%g, p=%g", paramDataAll[b].f2, paramDataAll[b].p); 212 printf(" Absolute pred MSE using %15s: %f\n", buf, paramDataAll[b].MSEs.back()); 213 } 214 cout << endl; 215 #endif 216 217 // prune out significantly suboptimal param settings 218 if (!CVnoEarlyExit) 219 for (int b = paramDataAll.size()-1; b >= 0; b--) 220 for (uint64 b2 = 0; b2 < paramDataAll.size(); b2++) 221 if (paramDataAll[b] < paramDataAll[b2]) { 222 paramDataAll.erase(paramDataAll.begin() + b); 223 break; 224 } 225 #ifdef VERBOSE 226 cout << "====> End CV fold " << (fold+1) << ": " << paramDataAll.size() 227 << " remaining param pair(s) <====" << endl << endl; 228 #endif 229 230 if (fold == 0) { // set predBoost: 1 - (smallest MSE) / (inf model MSE) 231 printf("Estimated proportion of variance explained using inf model: %.3f\n", PVEs[0]); 232 *predBoost = 1 - (1-bestPVE) / (1-PVEs[0]); 233 printf("Relative improvement in prediction MSE using non-inf model: %.3f\n\n", *predBoost); 234 if (*predBoost < predBoostMin && !CVnoEarlyExit) { 235 printf("Exiting CV: non-inf model does not substantially improve prediction\n"); 236 break; 237 } 238 } 239 240 // early exit if only one pair left 241 if (paramDataAll.size() == 1) { 242 cout << "Finished cross-validation; params sufficiently constrained after " 243 << (fold+1) << " folds" << endl; 244 break; 245 } 246 } 247 248 if (CVnoEarlyExit) { 249 cout << "*** Combined results across all folds ***" << endl; 250 printf("Baseline MSE: %g\n", NumericUtils::mean(baselineMSEs)); 251 vector < std::pair <double, double> > MSEsToBaselineMeanStds(paramDataAll.size()); 252 for (uint b = 0; b < paramDataAll.size(); b++) 253 MSEsToBaselineMeanStds[b] = 254 Jackknife::ratioOfSumsMeanStd(paramDataAll[b].MSEs, baselineMSEs); 255 uint bestInd = min_element(MSEsToBaselineMeanStds.begin(), MSEsToBaselineMeanStds.end()) 256 - MSEsToBaselineMeanStds.begin(); 257 printf("Pred R^2 and MSE using standard LMM: %6.3f (%.3f) %g\n", 258 1-MSEsToBaselineMeanStds[0].first, MSEsToBaselineMeanStds[0].second, 259 NumericUtils::mean(paramDataAll[0].MSEs)); 260 printf("Pred R^2 and MSE using best non-inf: %6.3f (%.3f) %g\n", 261 1-MSEsToBaselineMeanStds[bestInd].first, MSEsToBaselineMeanStds[bestInd].second, 262 NumericUtils::mean(paramDataAll[bestInd].MSEs)); 263 for (uint b = 0; b < paramDataAll.size(); b++) { 264 char buf[100]; sprintf(buf, "f2=%g, p=%g", paramDataAll[b].f2, paramDataAll[b].p); 265 printf(" Pred R^2 and MSE using %15s: %6.3f (%.3f) %g\n", buf, 266 1-MSEsToBaselineMeanStds[b].first, MSEsToBaselineMeanStds[b].second, 267 NumericUtils::mean(paramDataAll[b].MSEs)); 268 } 269 cout << endl; 270 } 271 272 // find best PVE; store corresponding f2, p in output params 273 double bestMeanPVE = -1e100; 274 for (uint64 b = 0; b < paramDataAll.size(); b++) { 275 double meanPVE = NumericUtils::mean(paramDataAll[b].PVEs); 276 if (meanPVE > bestMeanPVE) { 277 bestMeanPVE = meanPVE; 278 *f2Est = paramDataAll[b].f2; 279 *pEst = paramDataAll[b].p; 280 } 281 } 282 283 cout << "Optimal mixture parameters according to CV: f2 = " << *f2Est 284 << ", p = " << *pEst << endl; 285 286 ALIGNED_FREE(foldMaskIndivs); 287 288 return usedIters; 289 } 290 291 // for use in PhenoBuilder to generate random phenotypes getBoltRef(void) const292 const Bolt &BoltParEstCV::getBoltRef(void) const { 293 return bolt; 294 } 295 } 296