1 /*
2    This file is part of the BOLT-LMM linear mixed model software package
3    developed by Po-Ru Loh.  Copyright (C) 2014-2019 Harvard University.
4 
5    This program is free software: you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published by
7    the Free Software Foundation, either version 3 of the License, or
8    (at your option) any later version.
9 
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU General Public License for more details.
14 
15    You should have received a copy of the GNU General Public License
16    along with this program.  If not, see <http://www.gnu.org/licenses/>.
17 */
18 
19 #include <cstdio>
20 #include <cstdlib>
21 #include <cstring>
22 #include <iostream>
23 #include <vector>
24 #include <string>
25 #include <utility>
26 #include <algorithm>
27 
28 #include "BoltParEstCV.hpp"
29 #include "MemoryUtils.hpp"
30 #include "NumericUtils.hpp"
31 #include "StatsUtils.hpp"
32 #include "Jackknife.hpp"
33 
34 namespace LMM {
35 
36   using std::vector;
37   using std::string;
38   using std::cout;
39   using std::cerr;
40   using std::endl;
41 
ParamData(double _f2,double _p)42   BoltParEstCV::ParamData::ParamData(double _f2, double _p) : f2(_f2), p(_p) {}
43 
operator <(const BoltParEstCV::ParamData & paramData2) const44   bool BoltParEstCV::ParamData::operator < (const BoltParEstCV::ParamData &paramData2) const {
45     return StatsUtils::zScoreDiff(PVEs, paramData2.PVEs) < -2;
46   }
47 
BoltParEstCV(const SnpData & _snpData,const DataMatrix & _covarDataT,const double subMaskIndivs[],const vector<std::pair<std::string,DataMatrix::ValueType>> & _covars,int covarMaxLevels,bool _covarUseMissingIndic,int mBlockMultX,int Nautosomes)48   BoltParEstCV::BoltParEstCV
49   (const SnpData& _snpData, const DataMatrix& _covarDataT, const double subMaskIndivs[],
50    const vector < std::pair <std::string, DataMatrix::ValueType> > &_covars, int covarMaxLevels,
51    bool _covarUseMissingIndic, int mBlockMultX, int Nautosomes)
52     : snpData(_snpData), covarDataT(_covarDataT),
53       bolt(_snpData, _covarDataT, subMaskIndivs, _covars, covarMaxLevels, _covarUseMissingIndic,
54 	   mBlockMultX, Nautosomes),
55       covars(_covars), covarUseMissingIndic(_covarUseMissingIndic) {
56   }
57 
58   /**
59    * (f2, p) parameter estimation via cross-validation
60    * - after each fold, compare PVEs of putative (f2, p) param pairs
61    * - eliminate clearly suboptimal param pairs from future folds
62    * - stop when only one param pair left
63    *
64    * return: iterations used in last CV fold
65    */
estMixtureParams(double * f2Est,double * pEst,double * predBoost,const vector<double> & pheno,double logDeltaEst,double sigma2Kest,int CVfoldsSplit,int CVfoldsCompute,bool CVnoEarlyExit,double predBoostMin,bool MCMC,int maxIters,double approxLLtol,int mBlockMultX,int Nautosomes) const66   int BoltParEstCV::estMixtureParams
67   (double *f2Est, double *pEst, double *predBoost, const vector <double> &pheno,
68    double logDeltaEst, double sigma2Kest, int CVfoldsSplit, int CVfoldsCompute, bool CVnoEarlyExit,
69    double predBoostMin, bool MCMC, int maxIters, double approxLLtol, int mBlockMultX,
70    int Nautosomes) const {
71 
72     if (CVfoldsCompute <= 0) {
73       const int Nwant = 10000, Nrep = bolt.getNused() / CVfoldsSplit + 1;
74       CVfoldsCompute = std::min(CVfoldsSplit, (Nwant+Nrep-1) / Nrep);
75       cout << "Max CV folds to compute = " << CVfoldsCompute
76 	   << " (to have > " << Nwant << " samples)" << endl << endl;
77     }
78     if (CVfoldsCompute > CVfoldsSplit) {
79       cerr << "WARNING: CVfoldsCompute > CVfoldsSplit; setting CVfoldsCompute to " << CVfoldsSplit
80 	   << endl << endl;
81       CVfoldsCompute = CVfoldsSplit;
82     }
83 
84     int usedIters = 0;
85 
86     // try a fixed set of (f2, p) mixture param pairs
87     const int NUM_F2S = 3; const double f2s[NUM_F2S] = {0.5, 0.3, 0.1};
88     const int NUM_PS = 6; const double ps[NUM_PS] = {0.5, 0.2, 0.1, 0.05, 0.02, 0.01};
89     //const int NUM_PS = 9; const double ps[NUM_PS] = {0.5, 0.2, 0.1, 0.05, 0.02, 0.01, 0.005, 0.002, 0.001};
90 
91     // all (f2, p) pairs are in play at the start; this list will be pruned after each fold
92     // important: first pair corresponds to infinitesimal model
93     vector <ParamData> paramDataAll;
94     for (int f2i = 0; f2i < NUM_F2S; f2i++)
95       for (int pi = 0; pi < NUM_PS; pi++)
96 	paramDataAll.push_back(ParamData(f2s[f2i], ps[pi]));
97 
98     const double *maskIndivs = bolt.getMaskIndivs(); // possibly a subset of snpData.maskIndivs
99     uint64 Nstride = snpData.getNstride();
100     uint64 M = snpData.getM();
101 
102     // divide indivs into CVfoldsSplit folds
103     vector <int> foldAssignments(Nstride, -1); // -1 for masked indivs
104     int indCtr = 0;
105     for (uint64 n = 0; n < Nstride; n++)
106       if (maskIndivs[n])
107 	foldAssignments[n] = (indCtr++) % CVfoldsSplit;
108 
109     double *foldMaskIndivs = ALIGNED_MALLOC_DOUBLES(Nstride);
110     vector <double> baselineMSEs;
111 
112     // run OOS pred for each fold in turn
113     for (int fold = 0; fold < CVfoldsCompute; fold++) {
114 
115       cout << "====> Starting CV fold " << (fold+1) << " <====" << endl << endl;
116 
117       // set up fold assignment mask
118       for (uint64 n = 0; n < Nstride; n++)
119 	foldMaskIndivs[n] = (foldAssignments[n] != fold && foldAssignments[n] != -1);
120 
121       // create Bolt instance for predicting using non-left-out indivs
122       int foldCovarMaxLevels = 1<<30; // no need to re-check covar max levels
123       const Bolt boltFold(snpData, covarDataT, foldMaskIndivs, covars, foldCovarMaxLevels,
124 			  covarUseMissingIndic, mBlockMultX, Nautosomes);
125 
126       vector <double> PVEs; double baselinePredMSE;
127       { // set up arguments and call Bayes-iter
128 	uint64 B = paramDataAll.size(); // number of remaining (f2, p) pairs in play
129 
130 	double *phenoResidCovCompVecs = ALIGNED_MALLOC_DOUBLES(B*boltFold.getNCstride());
131 	boltFold.maskFillCovCompVecs(phenoResidCovCompVecs, &pheno[0], B);
132 
133 	double *betasTrans = ALIGNED_MALLOC_DOUBLES(M*B);
134 
135 	uchar *batchMaskSnps = ALIGNED_MALLOC_UCHARS(M*B);
136 	const uchar *projMaskSnpsFold = boltFold.getProjMaskSnps();
137 	for (uint64 m = 0; m < M; m++)
138 	  memset(batchMaskSnps + m*B, projMaskSnpsFold[m], B*sizeof(batchMaskSnps[0]));
139 
140 	uint64 MprojMaskFold = boltFold.getMprojMask();
141 	vector <uint64> Ms(B, MprojMaskFold);
142 
143 	vector <double> logDeltas(B, logDeltaEst);
144 	vector <double> sigma2Ks(B, sigma2Kest);
145 
146 	vector <double> varFrac2Ests(B), pEsts(B);
147 	for (uint64 b = 0; b < B; b++) {
148 	  varFrac2Ests[b] = paramDataAll[b].f2;
149 	  pEsts[b] = paramDataAll[b].p;
150 	}
151 
152 	// fit the models, one for each (f2, p) pair
153 	usedIters =
154 	  boltFold.batchComputeBayesIter(phenoResidCovCompVecs, betasTrans, batchMaskSnps,
155 					 &Ms[0], &logDeltas[0], &sigma2Ks[0], &varFrac2Ests[0],
156 					 &pEsts[0], B, MCMC, maxIters, approxLLtol);
157 
158 	// reset fold assignment mask to prediction indivs
159 	for (uint64 n = 0; n < Nstride; n++)
160 	  foldMaskIndivs[n] = (foldAssignments[n] == fold);
161 
162 	// build predictions and compute PVEs
163 	PVEs = boltFold.batchComputePredPVEs(&baselinePredMSE, &pheno[0], betasTrans, B,
164 					     foldMaskIndivs);
165 
166 	ALIGNED_FREE(batchMaskSnps);
167 	ALIGNED_FREE(betasTrans);
168 	ALIGNED_FREE(phenoResidCovCompVecs);
169       }
170 
171       baselineMSEs.push_back(baselinePredMSE);
172       for (uint64 b = 0; b < paramDataAll.size(); b++) {
173 	paramDataAll[b].PVEs.push_back(PVEs[b]);
174 	paramDataAll[b].MSEs.push_back(baselinePredMSE * (1-PVEs[b]));
175       }
176 
177 #ifdef VERBOSE
178       cout << endl << "Average PVEs obtained by param pairs tested (high to low):" << endl;
179       vector < std::pair <double, string> > avgPVEs;
180       for (uint64 b = 0; b < paramDataAll.size(); b++) {
181 	char buf[100]; sprintf(buf, "f2=%g, p=%g", paramDataAll[b].f2, paramDataAll[b].p);
182 	avgPVEs.push_back(std::make_pair(NumericUtils::mean(paramDataAll[b].PVEs), string(buf)));
183 	std::sort(avgPVEs.begin(), avgPVEs.end(), std::greater < std::pair <double, string> > ());
184       }
185       if (avgPVEs.size() <= 5)
186 	for (uint64 b = 0; b < avgPVEs.size(); b++)
187 	  printf("%15s: %f\n", avgPVEs[b].second.c_str(), avgPVEs[b].first);
188       else {
189 	for (int b = 0; b < 3; b++)
190 	  printf("%15s: %f\n", avgPVEs[b].second.c_str(), avgPVEs[b].first);
191 	printf("%15s\n", "...");
192 	printf("%15s: %f\n", avgPVEs.back().second.c_str(), avgPVEs.back().first);
193       }
194       cout << endl;
195 #endif
196 
197       double bestPVE = *std::max_element(PVEs.begin(), PVEs.end());
198       uint bestInd = std::max_element(PVEs.begin(), PVEs.end())-PVEs.begin();
199       char bestPars[100]; sprintf(bestPars, "f2=%g, p=%g", paramDataAll[bestInd].f2,
200 				  paramDataAll[bestInd].p);
201 
202 #ifdef VERBOSE
203       cout << "Detailed CV fold results:" << endl;
204       printf("  Absolute prediction MSE baseline (covariates only): %g\n", baselinePredMSE);
205       if (paramDataAll[0].f2 == 0.5 && paramDataAll[0].p == 0.5)
206 	printf("  Absolute prediction MSE using standard LMM:         %g\n",
207 	       paramDataAll[0].MSEs.back());
208       printf("  Absolute prediction MSE, fold-best %14s:  %g\n", bestPars,
209 	     paramDataAll[bestInd].MSEs.back());
210       for (uint b = 0; b < paramDataAll.size(); b++) {
211 	char buf[100]; sprintf(buf, "f2=%g, p=%g", paramDataAll[b].f2, paramDataAll[b].p);
212 	printf("    Absolute pred MSE using %15s: %f\n", buf, paramDataAll[b].MSEs.back());
213       }
214       cout << endl;
215 #endif
216 
217       // prune out significantly suboptimal param settings
218       if (!CVnoEarlyExit)
219 	for (int b = paramDataAll.size()-1; b >= 0; b--)
220 	  for (uint64 b2 = 0; b2 < paramDataAll.size(); b2++)
221 	    if (paramDataAll[b] < paramDataAll[b2]) {
222 	      paramDataAll.erase(paramDataAll.begin() + b);
223 	      break;
224 	    }
225 #ifdef VERBOSE
226       cout << "====> End CV fold " << (fold+1) << ": " << paramDataAll.size()
227 	   << " remaining param pair(s) <====" << endl << endl;
228 #endif
229 
230       if (fold == 0) { // set predBoost: 1 - (smallest MSE) / (inf model MSE)
231 	printf("Estimated proportion of variance explained using inf model: %.3f\n", PVEs[0]);
232 	*predBoost = 1 - (1-bestPVE) / (1-PVEs[0]);
233 	printf("Relative improvement in prediction MSE using non-inf model: %.3f\n\n", *predBoost);
234 	if (*predBoost < predBoostMin && !CVnoEarlyExit) {
235 	  printf("Exiting CV: non-inf model does not substantially improve prediction\n");
236 	  break;
237 	}
238       }
239 
240       // early exit if only one pair left
241       if (paramDataAll.size() == 1) {
242 	cout << "Finished cross-validation; params sufficiently constrained after "
243 	     << (fold+1) << " folds" << endl;
244 	break;
245       }
246     }
247 
248     if (CVnoEarlyExit) {
249       cout << "*** Combined results across all folds ***" << endl;
250       printf("Baseline MSE:   %g\n", NumericUtils::mean(baselineMSEs));
251       vector < std::pair <double, double> > MSEsToBaselineMeanStds(paramDataAll.size());
252       for (uint b = 0; b < paramDataAll.size(); b++)
253         MSEsToBaselineMeanStds[b] =
254 	  Jackknife::ratioOfSumsMeanStd(paramDataAll[b].MSEs, baselineMSEs);
255       uint bestInd = min_element(MSEsToBaselineMeanStds.begin(), MSEsToBaselineMeanStds.end())
256 	- MSEsToBaselineMeanStds.begin();
257       printf("Pred R^2 and MSE using standard LMM:   %6.3f (%.3f)   %g\n",
258 	     1-MSEsToBaselineMeanStds[0].first, MSEsToBaselineMeanStds[0].second,
259 	     NumericUtils::mean(paramDataAll[0].MSEs));
260       printf("Pred R^2 and MSE using best non-inf:   %6.3f (%.3f)   %g\n",
261 	     1-MSEsToBaselineMeanStds[bestInd].first, MSEsToBaselineMeanStds[bestInd].second,
262 	     NumericUtils::mean(paramDataAll[bestInd].MSEs));
263       for (uint b = 0; b < paramDataAll.size(); b++) {
264 	char buf[100]; sprintf(buf, "f2=%g, p=%g", paramDataAll[b].f2, paramDataAll[b].p);
265 	printf("  Pred R^2 and MSE using %15s: %6.3f (%.3f)   %g\n", buf,
266 	       1-MSEsToBaselineMeanStds[b].first, MSEsToBaselineMeanStds[b].second,
267 	       NumericUtils::mean(paramDataAll[b].MSEs));
268       }
269       cout << endl;
270     }
271 
272     // find best PVE; store corresponding f2, p in output params
273     double bestMeanPVE = -1e100;
274     for (uint64 b = 0; b < paramDataAll.size(); b++) {
275       double meanPVE = NumericUtils::mean(paramDataAll[b].PVEs);
276       if (meanPVE > bestMeanPVE) {
277 	bestMeanPVE = meanPVE;
278 	*f2Est = paramDataAll[b].f2;
279 	*pEst = paramDataAll[b].p;
280       }
281     }
282 
283     cout << "Optimal mixture parameters according to CV: f2 = " << *f2Est
284 	 << ", p = " << *pEst << endl;
285 
286     ALIGNED_FREE(foldMaskIndivs);
287 
288     return usedIters;
289   }
290 
291   // for use in PhenoBuilder to generate random phenotypes
getBoltRef(void) const292   const Bolt &BoltParEstCV::getBoltRef(void) const {
293     return bolt;
294   }
295 }
296