1 /*
2    This file is part of the BOLT-LMM linear mixed model software package
3    developed by Po-Ru Loh.  Copyright (C) 2014-2019 Harvard University.
4 
5    This program is free software: you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published by
7    the Free Software Foundation, either version 3 of the License, or
8    (at your option) any later version.
9 
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU General Public License for more details.
14 
15    You should have received a copy of the GNU General Public License
16    along with this program.  If not, see <http://www.gnu.org/licenses/>.
17 */
18 
19 #include <cstdio>
20 #include <cstring>
21 #include <cmath>
22 #include <vector>
23 #include <string>
24 #include <iostream>
25 #include <numeric>
26 #include <algorithm>
27 #include <utility>
28 #include <boost/utility.hpp>
29 
30 #include "LapackConst.hpp"
31 #include "Types.hpp"
32 #include "DataMatrix.hpp"
33 #include "MemoryUtils.hpp"
34 #include "CovariateBasis.hpp"
35 
36 namespace LMM {
37 
38   using std::vector;
39   using std::string;
40   using std::pair;
41   using std::cout;
42   using std::cerr;
43   using std::endl;
44 
45   const double CovariateBasis::MIN_REL_SINGULAR_VALUE = 1e-8;
46 
47   /*
48   // default init: use all covariates in covarDataT as quantitative... probably a bad idea
49   CovariateBasis::CovariateBasis(const DataMatrix &covarDataT, const double _maskIndivs[],
50 				 int covarMaxLevels) {
51     vector < pair <string, DataMatrix::ValueType> > covars;
52     for (uint64 r = 0; r < covarDataT.rowNames.size(); r++)
53       covars.push_back(std::make_pair(covarDataT.rowNames[r], DataMatrix::QUANTITATIVE));
54     init(covarDataT, _maskIndivs, covars, covarMaxLevels);
55   }
56   */
57 
CovariateBasis(const DataMatrix & covarDataT,const double _maskIndivs[],const vector<pair<string,DataMatrix::ValueType>> & covars,int covarMaxLevels,bool covarUseMissingIndic)58   CovariateBasis::CovariateBasis(const DataMatrix &covarDataT, const double _maskIndivs[],
59 				 const vector < pair <string, DataMatrix::ValueType> > &covars,
60 				 int covarMaxLevels, bool covarUseMissingIndic) {
61     init(covarDataT, _maskIndivs, covars, covarMaxLevels, covarUseMissingIndic);
62   }
63 
64   /**
65    * input:
66    * - covarDataT is assumed to contain the all-1s vector as its first row (indexed 0)
67    * - _maskIndivs has dimension >= covarDataT.ncols = N (if > N, additional cols are masked)
68    *   - assumed to be a subset of maskIndivs from original SnpData instance
69    *   - (presumably obtained by using SnpData.writeMaskIndivs and taking a subset)
70    * action:
71    * - copies _maskIndivs into member maskIndivs (and optionally performs missing masking)
72    * - builds masked copy of selected covariates (from covarDataT) in maskedCovars[C x Nstride]
73    *   - (possible later use: DGELS to get least squares coeffs wrt original covars)
74    * - mean-fills missing covariate values (using mean of non-masked, non-missing)
75    * - computes SVD; stores in basis[C x Nstride] and sets Cindep (warns if < C)
76    */
init(const DataMatrix & covarDataT,const double _maskIndivs[],vector<pair<string,DataMatrix::ValueType>> covars,int covarMaxLevels,bool covarUseMissingIndic)77   void CovariateBasis::init(const DataMatrix &covarDataT, const double _maskIndivs[],
78 			    vector < pair <string, DataMatrix::ValueType> > covars,
79 			    int covarMaxLevels, bool covarUseMissingIndic) {
80 
81     // all-1s vector must always be included
82     if (std::find(covars.begin(), covars.end(),
83 		  std::make_pair(DataMatrix::ALL_ONES_NAME, DataMatrix::QUANTITATIVE))
84 	== covars.end()) {
85       cerr << "NOTE: Using all-1s vector (constant term) in addition to specified covariates"
86 	   << endl;
87       covars.push_back(std::make_pair(DataMatrix::ALL_ONES_NAME, DataMatrix::QUANTITATIVE));
88     }
89 
90     // allocate maskIndivs; temporarily store missingness status
91     Nstride = covarDataT.ncols;
92     maskIndivs = ALIGNED_MALLOC_DOUBLES(Nstride);
93     for (uint64 n = 0; n < Nstride; n++) maskIndivs[n] = 1;
94 
95     // select covariates to use; error-check covars
96     vector < std::pair <uint64, string> > covarIndLevelPairs; // levels if categorical; o/w empty
97     const string QTVE_MISSING_INDICATOR = "QTVE_MISSING_INDICATOR";
98     for (vector < pair <string, DataMatrix::ValueType> >::const_iterator it = covars.begin();
99 	 it != covars.end(); it++) {
100       bool found = false;
101       for (uint64 cData = 0; cData < covarDataT.nrows; cData++)
102 	if (covarDataT.rowNames[cData] == it->first) {
103 	  if (it->second == DataMatrix::QUANTITATIVE) {
104 	    // update temp maskIndivs with missingness
105 	    for (uint64 n = 0; n < Nstride; n++)
106 	      if (covarDataT.getEntryDbl(cData, n) == covarDataT.missing_key_dbl)
107 		maskIndivs[n] = 0;
108 
109 	    cout << "    Using quantitative covariate: " << covarDataT.rowNames[cData] << endl;
110 	    covarIndLevelPairs.push_back(std::make_pair(cData, ""));
111 	    if (covarUseMissingIndic && it->first != DataMatrix::ALL_ONES_NAME) {
112 	      cout << "     (adding missing indicator: " << covarDataT.rowNames[cData] << ")"
113 		   << endl;
114 	      covarIndLevelPairs.push_back(std::make_pair(cData, QTVE_MISSING_INDICATOR));
115 	    }
116 	  }
117 	  else {
118 	    // update temp maskIndivs with missingness
119 	    for (uint64 n = 0; n < Nstride; n++)
120 	      if (covarDataT.isMissing(covarDataT.data[cData][n]))
121 		maskIndivs[n] = 0;
122 
123 	    // note: missing indicator already included for categorical covars
124 	    for (std::set <string>::iterator levelsIter = covarDataT.rowUniqLevels[cData].begin();
125 		 levelsIter != covarDataT.rowUniqLevels[cData].end(); levelsIter++) {
126 	      if (covarDataT.isMissing(*levelsIter) && !covarUseMissingIndic) continue;
127 	      cout << "    Using categorical covariate: " << covarDataT.rowNames[cData]
128 		   << " (adding level " << *levelsIter << ")" << endl;
129 	      covarIndLevelPairs.push_back(std::make_pair(cData, *levelsIter));
130 	    }
131 	    if (covarDataT.rowUniqLevels[cData].size() > 10) {
132 	      cerr << "WARNING: Covariate " << covarDataT.rowNames[cData]
133 		   << " has a large number of distinct values ("
134 		   << covarDataT.rowUniqLevels[cData].size() << ")" << endl;
135 	      cerr << "         Should this covariate be quantitative rather than categorical?"
136 		   << endl;
137 	    }
138 	    if ((int) covarDataT.rowUniqLevels[cData].size() > covarMaxLevels) {
139 	      cerr << "ERROR: Number of distinct values of covariate "
140 		   << covarDataT.rowNames[cData]
141 		   << " exceeds max (" << covarMaxLevels << ")" << endl;
142 	      cerr << "       Should this covariate be quantitative rather than categorical?"
143 		   << endl;
144 	      cerr << "If not, set --covarMaxLevels to turn off error-check" << endl;
145 	      exit(1);
146 	    }
147 	  }
148 	  found = true;
149 	  break;
150 	}
151       if (!found) {
152 	cerr << "ERROR: Unable to find covariate " << it->first << endl;
153 	exit(1);
154       }
155     }
156 
157     C = covarIndLevelPairs.size();
158 
159     Nused = 0; for (uint64 n = 0; n < Nstride; n++) Nused += (int) _maskIndivs[n];
160     int numSamplesMissingCovars = 0;
161     for (uint64 n = 0; n < Nstride; n++)
162       if (_maskIndivs[n] && !maskIndivs[n])
163 	numSamplesMissingCovars++;
164     if (numSamplesMissingCovars) {
165       cerr << "WARNING: " << numSamplesMissingCovars << " of " << Nused
166 	   << " samples passing previous QC have missing covariates" << endl;
167       if (covarUseMissingIndic)
168 	cerr << "  --covarUseMissingIndic is set, so these samples will still be analyzed" << endl;
169       else
170 	cerr << "  --covarUseMissingIndic is not set, so these samples will be removed" << endl;
171     }
172 
173     if (covarUseMissingIndic) // overwrite maskIndivs with input mask (use all input maskIndivs)
174       memcpy(maskIndivs, _maskIndivs, Nstride * sizeof(maskIndivs[0]));
175     else // complete case option: intersect input mask with missing covars mask
176       for (uint64 n = 0; n < Nstride; n++)
177 	maskIndivs[n] *= _maskIndivs[n];
178 
179     Nused = 0; for (uint64 n = 0; n < Nstride; n++) Nused += (int) maskIndivs[n];
180     cout << "Number of individuals used in analysis: Nused = " << Nused << endl;
181 
182     if (C > Nused) {
183       cerr << "ERROR: Number of covariates cannot exceed number of individuals" << endl;
184       exit(1);
185     }
186 
187     // mask the covariate data matrix with maskIndivs
188     // fill in missing covariate values
189     // - use covarDataT.missing_key to figure out which are missing; take means over maskIndivs==1
190     // - TODO: PLINK seems by default to eliminate indivs with missing covars from analysis?
191     double *unmaskedCovars = ALIGNED_MALLOC_DOUBLES(C*Nstride);
192     basis = ALIGNED_MALLOC_DOUBLES(C*Nstride);
193     for (uint64 c = 0; c < C; c++) {
194       uint64 cData = covarIndLevelPairs[c].first;
195       const string &covarLevel = covarIndLevelPairs[c].second;
196       if (covarLevel == "") { // quantitative
197 	for (uint64 n = 0; n < Nstride; n++) {
198 	  double value = covarDataT.getEntryDbl(cData, n);
199 	  if (value == covarDataT.missing_key_dbl) value = 0; // zero-fill missing values
200 	  unmaskedCovars[c*Nstride + n] = value;
201 	}
202       }
203       else if (covarLevel == QTVE_MISSING_INDICATOR) {
204 	for (uint64 n = 0; n < Nstride; n++) {
205 	  double value = covarDataT.getEntryDbl(cData, n);
206 	  unmaskedCovars[c*Nstride + n] = (value == covarDataT.missing_key_dbl ? 1 : 0);
207 	}
208       }
209       else {
210 	for (uint64 n = 0; n < Nstride; n++)
211 	  unmaskedCovars[c*Nstride + n] = (covarDataT.data[cData][n] == covarLevel);
212       }
213       for (uint64 n = 0; n < Nstride; n++) // temporarily holds masked covars; overwritten
214 	basis[c*Nstride + n] = unmaskedCovars[c*Nstride + n] * maskIndivs[n];
215     }
216 
217     // compute svd; set Cindep
218     double *S = ALIGNED_MALLOC_DOUBLES(C); // singular values
219     double *Vtrans = ALIGNED_MALLOC_DOUBLES(C*C); // V' stored in column-major order
220 
221     { // A (masked covariates) = U * Sigma * V'
222       char JOBU_ = 'O', JOBVT_ = 'A'; // overwrite input matrix with left singular vectors
223       int M_ = Nstride, N_ = C;
224       double *A_ = basis;
225       int LDA_ = Nstride;
226       double *S_ = S;
227       double *U_ = NULL;
228       int LDU_ = 1;
229       double *VT_ = Vtrans;
230       int LDVT_ = C;
231       int LWORK_ = 5*Nstride;
232       double *WORK_ = ALIGNED_MALLOC_DOUBLES(LWORK_);
233       int INFO_;
234       DGESVD_MACRO(&JOBU_, &JOBVT_, &M_, &N_, A_, &LDA_, S_, U_, &LDU_, VT_, &LDVT_,
235 		   WORK_, &LWORK_, &INFO_);
236       ALIGNED_FREE(WORK_);
237       if (INFO_ != 0) {
238 	cerr << "ERROR: SVD computation for covariate matrix failed" << endl;
239 	exit(1);
240       }
241     }
242 
243     cout << "Singular values of covariate matrix:" << endl;
244     for (uint64 c = 0; c < C; c++)
245       cout << "    S[" << c << "] = " << S[c] << endl;
246 
247     for (Cindep = 0; Cindep < C; Cindep++)
248       if (S[Cindep] < S[0] * MIN_REL_SINGULAR_VALUE)
249 	break;
250 
251     cout << "Total covariate vectors: C = " << C << endl;
252     cout << "Total independent covariate vectors: Cindep = " << Cindep << endl;
253 
254     basisExtAllIndivs = ALIGNED_MALLOC_DOUBLES(Cindep*Nstride);
255     // U ext = A (unmasked covariates) * V * Sigma^-1
256     { // A (unmasked covariates) * V
257       char TRANSA_ = 'N';
258       char TRANSB_ = 'T';
259       int M_ = Nstride;
260       int N_ = Cindep;
261       int K_ = C;
262       double ALPHA_ = 1.0;
263       double *A_ = unmaskedCovars;
264       int LDA_ = Nstride;
265       double *B_ = Vtrans;
266       int LDB_ = C;
267       double BETA_ = 0.0;
268       double *C_ = basisExtAllIndivs;
269       int LDC_ = Nstride;
270       DGEMM_MACRO(&TRANSA_, &TRANSB_, &M_, &N_, &K_, &ALPHA_, A_, &LDA_, B_, &LDB_,
271 		  &BETA_, C_, &LDC_);
272     }
273     // multiply by Sigma^-1
274     for (uint64 c = 0; c < Cindep; c++) {
275       double invS = 1.0/S[c];
276       for (uint64 n = 0; n < Nstride; n++)
277 	basisExtAllIndivs[c*Nstride+n] *= invS;
278     }
279     /*
280     for (uint64 c = 0; c < Cindep; c++)
281       for (uint64 n = 0; n < Nstride; n++)
282 	if (fabs(basisExtAllIndivs[c*Nstride+n] - basis[c*Nstride+n]) > 1e-6) {
283 	  cout << "basis c=" << c << " n=" << n << ": " << basis[c*Nstride+n] << " ext: " << basisExtAllIndivs[c*Nstride+n] << endl;
284 	}
285     */
286     ALIGNED_FREE(Vtrans);
287     ALIGNED_FREE(S);
288     ALIGNED_FREE(unmaskedCovars);
289   }
290 
~CovariateBasis()291   CovariateBasis::~CovariateBasis() {
292     ALIGNED_FREE(maskIndivs);
293     ALIGNED_FREE(basis);
294     ALIGNED_FREE(basisExtAllIndivs);
295   }
296 
297   //uint64 getC(void) const { return C; } -- don't expose this!
getCindep(void) const298   uint64 CovariateBasis::getCindep(void) const { return Cindep; }
getMaskIndivs(void) const299   const double *CovariateBasis::getMaskIndivs(void) const { return maskIndivs; }
getNused(void) const300   uint64 CovariateBasis::getNused(void) const { return Nused; }
getBasis(bool extAllIndivs) const301   const double *CovariateBasis::getBasis(bool extAllIndivs) const {
302     return extAllIndivs ? basisExtAllIndivs : basis;
303   }
304 
305   // vec is assumed to have dimension Nstride
306   // don't assume memory alignment (no SSE)
applyMaskIndivs(double vec[]) const307   void CovariateBasis::applyMaskIndivs(double vec[]) const {
308     for (uint64 n = 0; n < Nstride; n++)
309       vec[n] *= maskIndivs[n];
310   }
311 
312   /**
313    * Cindep values will be written to out[]
314    * vec[] has size Nstride
315    */
computeCindepComponents(double out[],const double vec[]) const316   void CovariateBasis::computeCindepComponents(double out[], const double vec[]) const {
317     char TRANS = 'T';
318     int M = Nstride, N = Cindep;
319     double ALPHA = 1;
320     // A = basis
321     int LDA = Nstride;
322     // X = vec
323     int INCX = 1;
324     double BETA = 0;
325     // Y = out
326     int INCY = 1;
327     DGEMV_MACRO(&TRANS, &M, &N, &ALPHA, basis, &LDA, vec, &INCX, &BETA, out, &INCY);
328   }
329 
330   /**
331    * vec[] has size Nstride
332    * don't assume memory alignment (no SSE)
333    */
projectCovars(double vec[]) const334   void CovariateBasis::projectCovars(double vec[]) const {
335     double *covarComps = ALIGNED_MALLOC_DOUBLES(Cindep);
336     double *vecAligned = ALIGNED_MALLOC_DOUBLES(Nstride);
337     memcpy(vecAligned, vec, Nstride*sizeof(vec[0]));
338     computeCindepComponents(covarComps, vecAligned);
339     for (uint64 c = 0; c < Cindep; c++)
340       for (uint64 n = 0; n < Nstride; n++)
341 	vec[n] -= covarComps[c] * basis[c*Nstride + n];
342     ALIGNED_FREE(vecAligned);
343     ALIGNED_FREE(covarComps);
344   }
345 
346   // debugging function
printProj(const double vec[],const char name[]) const347   void CovariateBasis::printProj(const double vec[], const char name[]) const {
348     double copyVec[Nstride]; memcpy(copyVec, vec, Nstride*sizeof(vec[0]));
349     projectCovars(copyVec);
350     printf("%s:\n", name);
351     for (uint64 n = 0; n < Nstride && n < 20; n++)
352       printf("%f\n", copyVec[n]);
353     printf("\n");
354   }
355 }
356