1 /* 2 This file is part of the BOLT-LMM linear mixed model software package 3 developed by Po-Ru Loh. Copyright (C) 2014-2019 Harvard University. 4 5 This program is free software: you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation, either version 3 of the License, or 8 (at your option) any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with this program. If not, see <http://www.gnu.org/licenses/>. 17 */ 18 19 #include <cstdio> 20 #include <cstring> 21 #include <cmath> 22 #include <vector> 23 #include <string> 24 #include <iostream> 25 #include <numeric> 26 #include <algorithm> 27 #include <utility> 28 #include <boost/utility.hpp> 29 30 #include "LapackConst.hpp" 31 #include "Types.hpp" 32 #include "DataMatrix.hpp" 33 #include "MemoryUtils.hpp" 34 #include "CovariateBasis.hpp" 35 36 namespace LMM { 37 38 using std::vector; 39 using std::string; 40 using std::pair; 41 using std::cout; 42 using std::cerr; 43 using std::endl; 44 45 const double CovariateBasis::MIN_REL_SINGULAR_VALUE = 1e-8; 46 47 /* 48 // default init: use all covariates in covarDataT as quantitative... probably a bad idea 49 CovariateBasis::CovariateBasis(const DataMatrix &covarDataT, const double _maskIndivs[], 50 int covarMaxLevels) { 51 vector < pair <string, DataMatrix::ValueType> > covars; 52 for (uint64 r = 0; r < covarDataT.rowNames.size(); r++) 53 covars.push_back(std::make_pair(covarDataT.rowNames[r], DataMatrix::QUANTITATIVE)); 54 init(covarDataT, _maskIndivs, covars, covarMaxLevels); 55 } 56 */ 57 CovariateBasis(const DataMatrix & covarDataT,const double _maskIndivs[],const vector<pair<string,DataMatrix::ValueType>> & covars,int covarMaxLevels,bool covarUseMissingIndic)58 CovariateBasis::CovariateBasis(const DataMatrix &covarDataT, const double _maskIndivs[], 59 const vector < pair <string, DataMatrix::ValueType> > &covars, 60 int covarMaxLevels, bool covarUseMissingIndic) { 61 init(covarDataT, _maskIndivs, covars, covarMaxLevels, covarUseMissingIndic); 62 } 63 64 /** 65 * input: 66 * - covarDataT is assumed to contain the all-1s vector as its first row (indexed 0) 67 * - _maskIndivs has dimension >= covarDataT.ncols = N (if > N, additional cols are masked) 68 * - assumed to be a subset of maskIndivs from original SnpData instance 69 * - (presumably obtained by using SnpData.writeMaskIndivs and taking a subset) 70 * action: 71 * - copies _maskIndivs into member maskIndivs (and optionally performs missing masking) 72 * - builds masked copy of selected covariates (from covarDataT) in maskedCovars[C x Nstride] 73 * - (possible later use: DGELS to get least squares coeffs wrt original covars) 74 * - mean-fills missing covariate values (using mean of non-masked, non-missing) 75 * - computes SVD; stores in basis[C x Nstride] and sets Cindep (warns if < C) 76 */ init(const DataMatrix & covarDataT,const double _maskIndivs[],vector<pair<string,DataMatrix::ValueType>> covars,int covarMaxLevels,bool covarUseMissingIndic)77 void CovariateBasis::init(const DataMatrix &covarDataT, const double _maskIndivs[], 78 vector < pair <string, DataMatrix::ValueType> > covars, 79 int covarMaxLevels, bool covarUseMissingIndic) { 80 81 // all-1s vector must always be included 82 if (std::find(covars.begin(), covars.end(), 83 std::make_pair(DataMatrix::ALL_ONES_NAME, DataMatrix::QUANTITATIVE)) 84 == covars.end()) { 85 cerr << "NOTE: Using all-1s vector (constant term) in addition to specified covariates" 86 << endl; 87 covars.push_back(std::make_pair(DataMatrix::ALL_ONES_NAME, DataMatrix::QUANTITATIVE)); 88 } 89 90 // allocate maskIndivs; temporarily store missingness status 91 Nstride = covarDataT.ncols; 92 maskIndivs = ALIGNED_MALLOC_DOUBLES(Nstride); 93 for (uint64 n = 0; n < Nstride; n++) maskIndivs[n] = 1; 94 95 // select covariates to use; error-check covars 96 vector < std::pair <uint64, string> > covarIndLevelPairs; // levels if categorical; o/w empty 97 const string QTVE_MISSING_INDICATOR = "QTVE_MISSING_INDICATOR"; 98 for (vector < pair <string, DataMatrix::ValueType> >::const_iterator it = covars.begin(); 99 it != covars.end(); it++) { 100 bool found = false; 101 for (uint64 cData = 0; cData < covarDataT.nrows; cData++) 102 if (covarDataT.rowNames[cData] == it->first) { 103 if (it->second == DataMatrix::QUANTITATIVE) { 104 // update temp maskIndivs with missingness 105 for (uint64 n = 0; n < Nstride; n++) 106 if (covarDataT.getEntryDbl(cData, n) == covarDataT.missing_key_dbl) 107 maskIndivs[n] = 0; 108 109 cout << " Using quantitative covariate: " << covarDataT.rowNames[cData] << endl; 110 covarIndLevelPairs.push_back(std::make_pair(cData, "")); 111 if (covarUseMissingIndic && it->first != DataMatrix::ALL_ONES_NAME) { 112 cout << " (adding missing indicator: " << covarDataT.rowNames[cData] << ")" 113 << endl; 114 covarIndLevelPairs.push_back(std::make_pair(cData, QTVE_MISSING_INDICATOR)); 115 } 116 } 117 else { 118 // update temp maskIndivs with missingness 119 for (uint64 n = 0; n < Nstride; n++) 120 if (covarDataT.isMissing(covarDataT.data[cData][n])) 121 maskIndivs[n] = 0; 122 123 // note: missing indicator already included for categorical covars 124 for (std::set <string>::iterator levelsIter = covarDataT.rowUniqLevels[cData].begin(); 125 levelsIter != covarDataT.rowUniqLevels[cData].end(); levelsIter++) { 126 if (covarDataT.isMissing(*levelsIter) && !covarUseMissingIndic) continue; 127 cout << " Using categorical covariate: " << covarDataT.rowNames[cData] 128 << " (adding level " << *levelsIter << ")" << endl; 129 covarIndLevelPairs.push_back(std::make_pair(cData, *levelsIter)); 130 } 131 if (covarDataT.rowUniqLevels[cData].size() > 10) { 132 cerr << "WARNING: Covariate " << covarDataT.rowNames[cData] 133 << " has a large number of distinct values (" 134 << covarDataT.rowUniqLevels[cData].size() << ")" << endl; 135 cerr << " Should this covariate be quantitative rather than categorical?" 136 << endl; 137 } 138 if ((int) covarDataT.rowUniqLevels[cData].size() > covarMaxLevels) { 139 cerr << "ERROR: Number of distinct values of covariate " 140 << covarDataT.rowNames[cData] 141 << " exceeds max (" << covarMaxLevels << ")" << endl; 142 cerr << " Should this covariate be quantitative rather than categorical?" 143 << endl; 144 cerr << "If not, set --covarMaxLevels to turn off error-check" << endl; 145 exit(1); 146 } 147 } 148 found = true; 149 break; 150 } 151 if (!found) { 152 cerr << "ERROR: Unable to find covariate " << it->first << endl; 153 exit(1); 154 } 155 } 156 157 C = covarIndLevelPairs.size(); 158 159 Nused = 0; for (uint64 n = 0; n < Nstride; n++) Nused += (int) _maskIndivs[n]; 160 int numSamplesMissingCovars = 0; 161 for (uint64 n = 0; n < Nstride; n++) 162 if (_maskIndivs[n] && !maskIndivs[n]) 163 numSamplesMissingCovars++; 164 if (numSamplesMissingCovars) { 165 cerr << "WARNING: " << numSamplesMissingCovars << " of " << Nused 166 << " samples passing previous QC have missing covariates" << endl; 167 if (covarUseMissingIndic) 168 cerr << " --covarUseMissingIndic is set, so these samples will still be analyzed" << endl; 169 else 170 cerr << " --covarUseMissingIndic is not set, so these samples will be removed" << endl; 171 } 172 173 if (covarUseMissingIndic) // overwrite maskIndivs with input mask (use all input maskIndivs) 174 memcpy(maskIndivs, _maskIndivs, Nstride * sizeof(maskIndivs[0])); 175 else // complete case option: intersect input mask with missing covars mask 176 for (uint64 n = 0; n < Nstride; n++) 177 maskIndivs[n] *= _maskIndivs[n]; 178 179 Nused = 0; for (uint64 n = 0; n < Nstride; n++) Nused += (int) maskIndivs[n]; 180 cout << "Number of individuals used in analysis: Nused = " << Nused << endl; 181 182 if (C > Nused) { 183 cerr << "ERROR: Number of covariates cannot exceed number of individuals" << endl; 184 exit(1); 185 } 186 187 // mask the covariate data matrix with maskIndivs 188 // fill in missing covariate values 189 // - use covarDataT.missing_key to figure out which are missing; take means over maskIndivs==1 190 // - TODO: PLINK seems by default to eliminate indivs with missing covars from analysis? 191 double *unmaskedCovars = ALIGNED_MALLOC_DOUBLES(C*Nstride); 192 basis = ALIGNED_MALLOC_DOUBLES(C*Nstride); 193 for (uint64 c = 0; c < C; c++) { 194 uint64 cData = covarIndLevelPairs[c].first; 195 const string &covarLevel = covarIndLevelPairs[c].second; 196 if (covarLevel == "") { // quantitative 197 for (uint64 n = 0; n < Nstride; n++) { 198 double value = covarDataT.getEntryDbl(cData, n); 199 if (value == covarDataT.missing_key_dbl) value = 0; // zero-fill missing values 200 unmaskedCovars[c*Nstride + n] = value; 201 } 202 } 203 else if (covarLevel == QTVE_MISSING_INDICATOR) { 204 for (uint64 n = 0; n < Nstride; n++) { 205 double value = covarDataT.getEntryDbl(cData, n); 206 unmaskedCovars[c*Nstride + n] = (value == covarDataT.missing_key_dbl ? 1 : 0); 207 } 208 } 209 else { 210 for (uint64 n = 0; n < Nstride; n++) 211 unmaskedCovars[c*Nstride + n] = (covarDataT.data[cData][n] == covarLevel); 212 } 213 for (uint64 n = 0; n < Nstride; n++) // temporarily holds masked covars; overwritten 214 basis[c*Nstride + n] = unmaskedCovars[c*Nstride + n] * maskIndivs[n]; 215 } 216 217 // compute svd; set Cindep 218 double *S = ALIGNED_MALLOC_DOUBLES(C); // singular values 219 double *Vtrans = ALIGNED_MALLOC_DOUBLES(C*C); // V' stored in column-major order 220 221 { // A (masked covariates) = U * Sigma * V' 222 char JOBU_ = 'O', JOBVT_ = 'A'; // overwrite input matrix with left singular vectors 223 int M_ = Nstride, N_ = C; 224 double *A_ = basis; 225 int LDA_ = Nstride; 226 double *S_ = S; 227 double *U_ = NULL; 228 int LDU_ = 1; 229 double *VT_ = Vtrans; 230 int LDVT_ = C; 231 int LWORK_ = 5*Nstride; 232 double *WORK_ = ALIGNED_MALLOC_DOUBLES(LWORK_); 233 int INFO_; 234 DGESVD_MACRO(&JOBU_, &JOBVT_, &M_, &N_, A_, &LDA_, S_, U_, &LDU_, VT_, &LDVT_, 235 WORK_, &LWORK_, &INFO_); 236 ALIGNED_FREE(WORK_); 237 if (INFO_ != 0) { 238 cerr << "ERROR: SVD computation for covariate matrix failed" << endl; 239 exit(1); 240 } 241 } 242 243 cout << "Singular values of covariate matrix:" << endl; 244 for (uint64 c = 0; c < C; c++) 245 cout << " S[" << c << "] = " << S[c] << endl; 246 247 for (Cindep = 0; Cindep < C; Cindep++) 248 if (S[Cindep] < S[0] * MIN_REL_SINGULAR_VALUE) 249 break; 250 251 cout << "Total covariate vectors: C = " << C << endl; 252 cout << "Total independent covariate vectors: Cindep = " << Cindep << endl; 253 254 basisExtAllIndivs = ALIGNED_MALLOC_DOUBLES(Cindep*Nstride); 255 // U ext = A (unmasked covariates) * V * Sigma^-1 256 { // A (unmasked covariates) * V 257 char TRANSA_ = 'N'; 258 char TRANSB_ = 'T'; 259 int M_ = Nstride; 260 int N_ = Cindep; 261 int K_ = C; 262 double ALPHA_ = 1.0; 263 double *A_ = unmaskedCovars; 264 int LDA_ = Nstride; 265 double *B_ = Vtrans; 266 int LDB_ = C; 267 double BETA_ = 0.0; 268 double *C_ = basisExtAllIndivs; 269 int LDC_ = Nstride; 270 DGEMM_MACRO(&TRANSA_, &TRANSB_, &M_, &N_, &K_, &ALPHA_, A_, &LDA_, B_, &LDB_, 271 &BETA_, C_, &LDC_); 272 } 273 // multiply by Sigma^-1 274 for (uint64 c = 0; c < Cindep; c++) { 275 double invS = 1.0/S[c]; 276 for (uint64 n = 0; n < Nstride; n++) 277 basisExtAllIndivs[c*Nstride+n] *= invS; 278 } 279 /* 280 for (uint64 c = 0; c < Cindep; c++) 281 for (uint64 n = 0; n < Nstride; n++) 282 if (fabs(basisExtAllIndivs[c*Nstride+n] - basis[c*Nstride+n]) > 1e-6) { 283 cout << "basis c=" << c << " n=" << n << ": " << basis[c*Nstride+n] << " ext: " << basisExtAllIndivs[c*Nstride+n] << endl; 284 } 285 */ 286 ALIGNED_FREE(Vtrans); 287 ALIGNED_FREE(S); 288 ALIGNED_FREE(unmaskedCovars); 289 } 290 ~CovariateBasis()291 CovariateBasis::~CovariateBasis() { 292 ALIGNED_FREE(maskIndivs); 293 ALIGNED_FREE(basis); 294 ALIGNED_FREE(basisExtAllIndivs); 295 } 296 297 //uint64 getC(void) const { return C; } -- don't expose this! getCindep(void) const298 uint64 CovariateBasis::getCindep(void) const { return Cindep; } getMaskIndivs(void) const299 const double *CovariateBasis::getMaskIndivs(void) const { return maskIndivs; } getNused(void) const300 uint64 CovariateBasis::getNused(void) const { return Nused; } getBasis(bool extAllIndivs) const301 const double *CovariateBasis::getBasis(bool extAllIndivs) const { 302 return extAllIndivs ? basisExtAllIndivs : basis; 303 } 304 305 // vec is assumed to have dimension Nstride 306 // don't assume memory alignment (no SSE) applyMaskIndivs(double vec[]) const307 void CovariateBasis::applyMaskIndivs(double vec[]) const { 308 for (uint64 n = 0; n < Nstride; n++) 309 vec[n] *= maskIndivs[n]; 310 } 311 312 /** 313 * Cindep values will be written to out[] 314 * vec[] has size Nstride 315 */ computeCindepComponents(double out[],const double vec[]) const316 void CovariateBasis::computeCindepComponents(double out[], const double vec[]) const { 317 char TRANS = 'T'; 318 int M = Nstride, N = Cindep; 319 double ALPHA = 1; 320 // A = basis 321 int LDA = Nstride; 322 // X = vec 323 int INCX = 1; 324 double BETA = 0; 325 // Y = out 326 int INCY = 1; 327 DGEMV_MACRO(&TRANS, &M, &N, &ALPHA, basis, &LDA, vec, &INCX, &BETA, out, &INCY); 328 } 329 330 /** 331 * vec[] has size Nstride 332 * don't assume memory alignment (no SSE) 333 */ projectCovars(double vec[]) const334 void CovariateBasis::projectCovars(double vec[]) const { 335 double *covarComps = ALIGNED_MALLOC_DOUBLES(Cindep); 336 double *vecAligned = ALIGNED_MALLOC_DOUBLES(Nstride); 337 memcpy(vecAligned, vec, Nstride*sizeof(vec[0])); 338 computeCindepComponents(covarComps, vecAligned); 339 for (uint64 c = 0; c < Cindep; c++) 340 for (uint64 n = 0; n < Nstride; n++) 341 vec[n] -= covarComps[c] * basis[c*Nstride + n]; 342 ALIGNED_FREE(vecAligned); 343 ALIGNED_FREE(covarComps); 344 } 345 346 // debugging function printProj(const double vec[],const char name[]) const347 void CovariateBasis::printProj(const double vec[], const char name[]) const { 348 double copyVec[Nstride]; memcpy(copyVec, vec, Nstride*sizeof(vec[0])); 349 projectCovars(copyVec); 350 printf("%s:\n", name); 351 for (uint64 n = 0; n < Nstride && n < 20; n++) 352 printf("%f\n", copyVec[n]); 353 printf("\n"); 354 } 355 } 356