/*
This file is part of the BOLT-LMM linear mixed model software package
developed by Po-Ru Loh. Copyright (C) 2014-2019 Harvard University.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see .
*/
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "LapackConst.hpp"
#include "Types.hpp"
#include "DataMatrix.hpp"
#include "MemoryUtils.hpp"
#include "CovariateBasis.hpp"
namespace LMM {
using std::vector;
using std::string;
using std::pair;
using std::cout;
using std::cerr;
using std::endl;
const double CovariateBasis::MIN_REL_SINGULAR_VALUE = 1e-8;
/*
// default init: use all covariates in covarDataT as quantitative... probably a bad idea
CovariateBasis::CovariateBasis(const DataMatrix &covarDataT, const double _maskIndivs[],
int covarMaxLevels) {
vector < pair > covars;
for (uint64 r = 0; r < covarDataT.rowNames.size(); r++)
covars.push_back(std::make_pair(covarDataT.rowNames[r], DataMatrix::QUANTITATIVE));
init(covarDataT, _maskIndivs, covars, covarMaxLevels);
}
*/
CovariateBasis::CovariateBasis(const DataMatrix &covarDataT, const double _maskIndivs[],
const vector < pair > &covars,
int covarMaxLevels, bool covarUseMissingIndic) {
init(covarDataT, _maskIndivs, covars, covarMaxLevels, covarUseMissingIndic);
}
/**
* input:
* - covarDataT is assumed to contain the all-1s vector as its first row (indexed 0)
* - _maskIndivs has dimension >= covarDataT.ncols = N (if > N, additional cols are masked)
* - assumed to be a subset of maskIndivs from original SnpData instance
* - (presumably obtained by using SnpData.writeMaskIndivs and taking a subset)
* action:
* - copies _maskIndivs into member maskIndivs (and optionally performs missing masking)
* - builds masked copy of selected covariates (from covarDataT) in maskedCovars[C x Nstride]
* - (possible later use: DGELS to get least squares coeffs wrt original covars)
* - mean-fills missing covariate values (using mean of non-masked, non-missing)
* - computes SVD; stores in basis[C x Nstride] and sets Cindep (warns if < C)
*/
void CovariateBasis::init(const DataMatrix &covarDataT, const double _maskIndivs[],
vector < pair > covars,
int covarMaxLevels, bool covarUseMissingIndic) {
// all-1s vector must always be included
if (std::find(covars.begin(), covars.end(),
std::make_pair(DataMatrix::ALL_ONES_NAME, DataMatrix::QUANTITATIVE))
== covars.end()) {
cerr << "NOTE: Using all-1s vector (constant term) in addition to specified covariates"
<< endl;
covars.push_back(std::make_pair(DataMatrix::ALL_ONES_NAME, DataMatrix::QUANTITATIVE));
}
// allocate maskIndivs; temporarily store missingness status
Nstride = covarDataT.ncols;
maskIndivs = ALIGNED_MALLOC_DOUBLES(Nstride);
for (uint64 n = 0; n < Nstride; n++) maskIndivs[n] = 1;
// select covariates to use; error-check covars
vector < std::pair > covarIndLevelPairs; // levels if categorical; o/w empty
const string QTVE_MISSING_INDICATOR = "QTVE_MISSING_INDICATOR";
for (vector < pair >::const_iterator it = covars.begin();
it != covars.end(); it++) {
bool found = false;
for (uint64 cData = 0; cData < covarDataT.nrows; cData++)
if (covarDataT.rowNames[cData] == it->first) {
if (it->second == DataMatrix::QUANTITATIVE) {
// update temp maskIndivs with missingness
for (uint64 n = 0; n < Nstride; n++)
if (covarDataT.getEntryDbl(cData, n) == covarDataT.missing_key_dbl)
maskIndivs[n] = 0;
cout << " Using quantitative covariate: " << covarDataT.rowNames[cData] << endl;
covarIndLevelPairs.push_back(std::make_pair(cData, ""));
if (covarUseMissingIndic && it->first != DataMatrix::ALL_ONES_NAME) {
cout << " (adding missing indicator: " << covarDataT.rowNames[cData] << ")"
<< endl;
covarIndLevelPairs.push_back(std::make_pair(cData, QTVE_MISSING_INDICATOR));
}
}
else {
// update temp maskIndivs with missingness
for (uint64 n = 0; n < Nstride; n++)
if (covarDataT.isMissing(covarDataT.data[cData][n]))
maskIndivs[n] = 0;
// note: missing indicator already included for categorical covars
for (std::set ::iterator levelsIter = covarDataT.rowUniqLevels[cData].begin();
levelsIter != covarDataT.rowUniqLevels[cData].end(); levelsIter++) {
if (covarDataT.isMissing(*levelsIter) && !covarUseMissingIndic) continue;
cout << " Using categorical covariate: " << covarDataT.rowNames[cData]
<< " (adding level " << *levelsIter << ")" << endl;
covarIndLevelPairs.push_back(std::make_pair(cData, *levelsIter));
}
if (covarDataT.rowUniqLevels[cData].size() > 10) {
cerr << "WARNING: Covariate " << covarDataT.rowNames[cData]
<< " has a large number of distinct values ("
<< covarDataT.rowUniqLevels[cData].size() << ")" << endl;
cerr << " Should this covariate be quantitative rather than categorical?"
<< endl;
}
if ((int) covarDataT.rowUniqLevels[cData].size() > covarMaxLevels) {
cerr << "ERROR: Number of distinct values of covariate "
<< covarDataT.rowNames[cData]
<< " exceeds max (" << covarMaxLevels << ")" << endl;
cerr << " Should this covariate be quantitative rather than categorical?"
<< endl;
cerr << "If not, set --covarMaxLevels to turn off error-check" << endl;
exit(1);
}
}
found = true;
break;
}
if (!found) {
cerr << "ERROR: Unable to find covariate " << it->first << endl;
exit(1);
}
}
C = covarIndLevelPairs.size();
Nused = 0; for (uint64 n = 0; n < Nstride; n++) Nused += (int) _maskIndivs[n];
int numSamplesMissingCovars = 0;
for (uint64 n = 0; n < Nstride; n++)
if (_maskIndivs[n] && !maskIndivs[n])
numSamplesMissingCovars++;
if (numSamplesMissingCovars) {
cerr << "WARNING: " << numSamplesMissingCovars << " of " << Nused
<< " samples passing previous QC have missing covariates" << endl;
if (covarUseMissingIndic)
cerr << " --covarUseMissingIndic is set, so these samples will still be analyzed" << endl;
else
cerr << " --covarUseMissingIndic is not set, so these samples will be removed" << endl;
}
if (covarUseMissingIndic) // overwrite maskIndivs with input mask (use all input maskIndivs)
memcpy(maskIndivs, _maskIndivs, Nstride * sizeof(maskIndivs[0]));
else // complete case option: intersect input mask with missing covars mask
for (uint64 n = 0; n < Nstride; n++)
maskIndivs[n] *= _maskIndivs[n];
Nused = 0; for (uint64 n = 0; n < Nstride; n++) Nused += (int) maskIndivs[n];
cout << "Number of individuals used in analysis: Nused = " << Nused << endl;
if (C > Nused) {
cerr << "ERROR: Number of covariates cannot exceed number of individuals" << endl;
exit(1);
}
// mask the covariate data matrix with maskIndivs
// fill in missing covariate values
// - use covarDataT.missing_key to figure out which are missing; take means over maskIndivs==1
// - TODO: PLINK seems by default to eliminate indivs with missing covars from analysis?
double *unmaskedCovars = ALIGNED_MALLOC_DOUBLES(C*Nstride);
basis = ALIGNED_MALLOC_DOUBLES(C*Nstride);
for (uint64 c = 0; c < C; c++) {
uint64 cData = covarIndLevelPairs[c].first;
const string &covarLevel = covarIndLevelPairs[c].second;
if (covarLevel == "") { // quantitative
for (uint64 n = 0; n < Nstride; n++) {
double value = covarDataT.getEntryDbl(cData, n);
if (value == covarDataT.missing_key_dbl) value = 0; // zero-fill missing values
unmaskedCovars[c*Nstride + n] = value;
}
}
else if (covarLevel == QTVE_MISSING_INDICATOR) {
for (uint64 n = 0; n < Nstride; n++) {
double value = covarDataT.getEntryDbl(cData, n);
unmaskedCovars[c*Nstride + n] = (value == covarDataT.missing_key_dbl ? 1 : 0);
}
}
else {
for (uint64 n = 0; n < Nstride; n++)
unmaskedCovars[c*Nstride + n] = (covarDataT.data[cData][n] == covarLevel);
}
for (uint64 n = 0; n < Nstride; n++) // temporarily holds masked covars; overwritten
basis[c*Nstride + n] = unmaskedCovars[c*Nstride + n] * maskIndivs[n];
}
// compute svd; set Cindep
double *S = ALIGNED_MALLOC_DOUBLES(C); // singular values
double *Vtrans = ALIGNED_MALLOC_DOUBLES(C*C); // V' stored in column-major order
{ // A (masked covariates) = U * Sigma * V'
char JOBU_ = 'O', JOBVT_ = 'A'; // overwrite input matrix with left singular vectors
int M_ = Nstride, N_ = C;
double *A_ = basis;
int LDA_ = Nstride;
double *S_ = S;
double *U_ = NULL;
int LDU_ = 1;
double *VT_ = Vtrans;
int LDVT_ = C;
int LWORK_ = 5*Nstride;
double *WORK_ = ALIGNED_MALLOC_DOUBLES(LWORK_);
int INFO_;
DGESVD_MACRO(&JOBU_, &JOBVT_, &M_, &N_, A_, &LDA_, S_, U_, &LDU_, VT_, &LDVT_,
WORK_, &LWORK_, &INFO_);
ALIGNED_FREE(WORK_);
if (INFO_ != 0) {
cerr << "ERROR: SVD computation for covariate matrix failed" << endl;
exit(1);
}
}
cout << "Singular values of covariate matrix:" << endl;
for (uint64 c = 0; c < C; c++)
cout << " S[" << c << "] = " << S[c] << endl;
for (Cindep = 0; Cindep < C; Cindep++)
if (S[Cindep] < S[0] * MIN_REL_SINGULAR_VALUE)
break;
cout << "Total covariate vectors: C = " << C << endl;
cout << "Total independent covariate vectors: Cindep = " << Cindep << endl;
basisExtAllIndivs = ALIGNED_MALLOC_DOUBLES(Cindep*Nstride);
// U ext = A (unmasked covariates) * V * Sigma^-1
{ // A (unmasked covariates) * V
char TRANSA_ = 'N';
char TRANSB_ = 'T';
int M_ = Nstride;
int N_ = Cindep;
int K_ = C;
double ALPHA_ = 1.0;
double *A_ = unmaskedCovars;
int LDA_ = Nstride;
double *B_ = Vtrans;
int LDB_ = C;
double BETA_ = 0.0;
double *C_ = basisExtAllIndivs;
int LDC_ = Nstride;
DGEMM_MACRO(&TRANSA_, &TRANSB_, &M_, &N_, &K_, &ALPHA_, A_, &LDA_, B_, &LDB_,
&BETA_, C_, &LDC_);
}
// multiply by Sigma^-1
for (uint64 c = 0; c < Cindep; c++) {
double invS = 1.0/S[c];
for (uint64 n = 0; n < Nstride; n++)
basisExtAllIndivs[c*Nstride+n] *= invS;
}
/*
for (uint64 c = 0; c < Cindep; c++)
for (uint64 n = 0; n < Nstride; n++)
if (fabs(basisExtAllIndivs[c*Nstride+n] - basis[c*Nstride+n]) > 1e-6) {
cout << "basis c=" << c << " n=" << n << ": " << basis[c*Nstride+n] << " ext: " << basisExtAllIndivs[c*Nstride+n] << endl;
}
*/
ALIGNED_FREE(Vtrans);
ALIGNED_FREE(S);
ALIGNED_FREE(unmaskedCovars);
}
CovariateBasis::~CovariateBasis() {
ALIGNED_FREE(maskIndivs);
ALIGNED_FREE(basis);
ALIGNED_FREE(basisExtAllIndivs);
}
//uint64 getC(void) const { return C; } -- don't expose this!
uint64 CovariateBasis::getCindep(void) const { return Cindep; }
const double *CovariateBasis::getMaskIndivs(void) const { return maskIndivs; }
uint64 CovariateBasis::getNused(void) const { return Nused; }
const double *CovariateBasis::getBasis(bool extAllIndivs) const {
return extAllIndivs ? basisExtAllIndivs : basis;
}
// vec is assumed to have dimension Nstride
// don't assume memory alignment (no SSE)
void CovariateBasis::applyMaskIndivs(double vec[]) const {
for (uint64 n = 0; n < Nstride; n++)
vec[n] *= maskIndivs[n];
}
/**
* Cindep values will be written to out[]
* vec[] has size Nstride
*/
void CovariateBasis::computeCindepComponents(double out[], const double vec[]) const {
char TRANS = 'T';
int M = Nstride, N = Cindep;
double ALPHA = 1;
// A = basis
int LDA = Nstride;
// X = vec
int INCX = 1;
double BETA = 0;
// Y = out
int INCY = 1;
DGEMV_MACRO(&TRANS, &M, &N, &ALPHA, basis, &LDA, vec, &INCX, &BETA, out, &INCY);
}
/**
* vec[] has size Nstride
* don't assume memory alignment (no SSE)
*/
void CovariateBasis::projectCovars(double vec[]) const {
double *covarComps = ALIGNED_MALLOC_DOUBLES(Cindep);
double *vecAligned = ALIGNED_MALLOC_DOUBLES(Nstride);
memcpy(vecAligned, vec, Nstride*sizeof(vec[0]));
computeCindepComponents(covarComps, vecAligned);
for (uint64 c = 0; c < Cindep; c++)
for (uint64 n = 0; n < Nstride; n++)
vec[n] -= covarComps[c] * basis[c*Nstride + n];
ALIGNED_FREE(vecAligned);
ALIGNED_FREE(covarComps);
}
// debugging function
void CovariateBasis::printProj(const double vec[], const char name[]) const {
double copyVec[Nstride]; memcpy(copyVec, vec, Nstride*sizeof(vec[0]));
projectCovars(copyVec);
printf("%s:\n", name);
for (uint64 n = 0; n < Nstride && n < 20; n++)
printf("%f\n", copyVec[n]);
printf("\n");
}
}