1 /* 2 This file is part of the BOLT-LMM linear mixed model software package 3 developed by Po-Ru Loh. Copyright (C) 2014-2019 Harvard University. 4 5 This program is free software: you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation, either version 3 of the License, or 8 (at your option) any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with this program. If not, see <http://www.gnu.org/licenses/>. 17 */ 18 19 #include <cmath> 20 #include <cstring> 21 #include <cstdio> 22 #include <vector> 23 #include <string> 24 #include <iostream> 25 #include <sstream> 26 #include <fstream> 27 #include <map> 28 #include <set> 29 #include <algorithm> 30 #include <numeric> 31 #include <utility> 32 33 #include "omp.h" 34 35 #include <boost/random.hpp> 36 #include <boost/random/mersenne_twister.hpp> 37 #include <boost/random/normal_distribution.hpp> 38 #include <boost/numeric/ublas/vector.hpp> 39 #include <boost/numeric/ublas/matrix.hpp> 40 #include <boost/numeric/ublas/io.hpp> 41 42 #include "Types.hpp" 43 #include "Timer.hpp" 44 #include "SnpData.hpp" 45 #include "CovariateBasis.hpp" 46 #include "NumericUtils.hpp" 47 #include "LapackConst.hpp" 48 #include "MemoryUtils.hpp" 49 #include "Jackknife.hpp" 50 #include "StatsUtils.hpp" 51 #include "MatrixUtils.hpp" 52 #include "NonlinearOptMulti.hpp" 53 #include "Bolt.hpp" 54 55 namespace LMM { 56 57 namespace ublas = boost::numeric::ublas; 58 using std::vector; 59 using std::string; 60 using std::pair; 61 using std::cout; 62 using std::cerr; 63 using std::endl; 64 65 /** 66 * for k=1..VCs, computes (Theta_k[b] - coeffI*I) * x[b] = (Z_k[b] * Z_k[b]' - coeffI*I) * x[b], 67 * where 68 * Z_k[b] = sqrt(vcXscale2s[k]) X_k 69 * Theta_k[b] = vcXscale2s[k] X_k * X_k' 70 * (see conjGradSolveW for more info) 71 * 72 * multCovCompVecs: (out) VCs x B x (Nstride+Cstride) 73 * xCovCompVecs: (in) B x (Nstride+Cstride) 74 * h2Xscale2batches: (in) (1+VCs) x B table of combined scale factors for easy application 75 */ multThetaMinusIs(double multCovCompVecs[],const double xCovCompVecs[],const uchar snpVCnums[],const vector<double> & vcXscale2s,uint64 B,double coeffI) const76 void Bolt::multThetaMinusIs(double multCovCompVecs[], const double xCovCompVecs[], 77 const uchar snpVCnums[], const vector <double> &vcXscale2s, 78 uint64 B, double coeffI) const { 79 #ifdef VERBOSE 80 Timer timer; 81 printf(" Multiplying solutions by variance components... "); 82 fflush(stdout); 83 #endif 84 int VCs = vcXscale2s.size()-1; 85 const uint64 BxNC = B*(Nstride+Cstride); 86 memset(multCovCompVecs, 0, VCs * BxNC * sizeof(multCovCompVecs[0])); // initialize answers to 0 87 88 double *snpCovCompVecBlock = ALIGNED_MALLOC_DOUBLES(mBlockMultX * (Nstride+Cstride)); 89 double (*work)[4] = (double (*)[4]) ALIGNED_MALLOC(omp_get_max_threads() * 256*sizeof(*work)); 90 double *XtransVecsBlock = ALIGNED_MALLOC_DOUBLES(mBlockMultX * B); 91 92 // for each SNP block... 93 for (uint64 m0 = 0; m0 < M; m0 += mBlockMultX) { 94 95 // (1) load the SNPs in parallel into Xblock [Nstride+Cstride x block size] 96 uint64 mBlockMultXCrop = std::min(M, m0+mBlockMultX) - m0; 97 #pragma omp parallel for 98 for (uint64 mPlus = 0; mPlus < mBlockMultXCrop; mPlus++) { 99 uint64 m = m0+mPlus; 100 if (projMaskSnps[m] && snpVCnums[m]) // build snp vector + sign-flipped covar comps 101 buildMaskedSnpNegCovCompVec(snpCovCompVecBlock + mPlus * (Nstride+Cstride), m, 102 work + (omp_get_thread_num()<<8)); 103 else 104 memset(snpCovCompVecBlock + mPlus * (Nstride+Cstride), 0, 105 (Nstride+Cstride) * sizeof(snpCovCompVecBlock[0])); 106 } 107 108 // (2) multiply Xblock' [block size x Nstride+Cstride] * xVecs [Nstride+Cstride x B] 109 // = XtransVecsBlock [block size x B] 110 // (note that Xblock' has NEG CCVecs while xVecs has POS CCVecs) 111 { 112 char TRANSA_ = 'T'; 113 char TRANSB_ = 'N'; 114 int M_ = B; 115 int N_ = mBlockMultXCrop; 116 int K_ = Nstride+Cstride; 117 double ALPHA_ = 1.0; 118 const double *A_ = xCovCompVecs; 119 int LDA_ = Nstride+Cstride; 120 double *B_ = snpCovCompVecBlock; 121 int LDB_ = Nstride+Cstride; 122 double BETA_ = 0.0; 123 double *C_ = XtransVecsBlock; 124 int LDC_ = B; 125 126 DGEMM_MACRO(&TRANSA_, &TRANSB_, &M_, &N_, &K_, &ALPHA_, A_, &LDA_, B_, &LDB_, 127 &BETA_, C_, &LDC_); 128 } 129 130 // (4) for each snp individually in turn: 131 // multiply Xblock_mPlus [NCstride x 1] * XtransVecsBlock_mPlus [1 x B] 132 // = snp's contrib. to (k-1)^th result block (X_k X_k') * xVecs [NCstride x B] 133 // directly accumulate results in (k-1)^th answer mult [NCstride x B] 134 for (uint64 mPlus = 0; mPlus < mBlockMultXCrop; mPlus++) { 135 uint64 m = m0+mPlus; 136 if (projMaskSnps[m] && snpVCnums[m]) { 137 double *snpCovCompVec = snpCovCompVecBlock + mPlus * (Nstride+Cstride); 138 // note that Xblock was loaded with NEG CCVecs; now we need to sign-flip CCVecs 139 for (uint64 n = Nstride; n < Nstride+Cstride; n++) 140 snpCovCompVec[n] *= -1; 141 142 // update result block for snpVCnums[m]: add XtransVecsBlock_mPlus * snpCovCompVec (DGER) 143 { 144 int M_ = Nstride+Cstride; 145 int N_ = B; 146 double ALPHA_ = 1; 147 double *X_ = snpCovCompVec; 148 int INCX_ = 1; 149 double *Y_ = XtransVecsBlock + mPlus * B; 150 int INCY_ = 1; 151 double *A_ = multCovCompVecs + (snpVCnums[m]-1) * BxNC; 152 int LDA_ = Nstride+Cstride; 153 DGER_MACRO(&M_, &N_, &ALPHA_, X_, &INCX_, Y_, &INCY_, A_, &LDA_); 154 } 155 } 156 } 157 } 158 ALIGNED_FREE(XtransVecsBlock); 159 ALIGNED_FREE(work); 160 ALIGNED_FREE(snpCovCompVecBlock); 161 162 // multiply vcXscale2s and subtract identity contributions 163 for (int v = 0; v < VCs; v++) 164 for (uint64 bnc = 0; bnc < BxNC; bnc++) 165 multCovCompVecs[v*BxNC + bnc] = 166 vcXscale2s[v+1] * multCovCompVecs[v*BxNC + bnc] - coeffI * xCovCompVecs[bnc]; 167 #ifdef VERBOSE 168 printf("time=%.2f\n", timer.update_time()); 169 fflush(stdout); 170 #endif 171 } 172 rightMultiT(double * x,uint64 D,uint64 stride,const ublas::matrix<double> & mat)173 void rightMultiT(double *x, uint64 D, uint64 stride, const ublas::matrix <double> &mat) { 174 vector <double> tmpD(D); 175 for (uint64 d = 0; d < D; d++) 176 for (uint64 i = 0; i < D; i++) 177 tmpD[d] += x[i*stride] * mat(d, i); 178 for (uint64 d = 0; d < D; d++) 179 x[d*stride] = tmpD[d]; 180 } 181 dotMultiCovCompVecs(const double xCovCompVecs[],const double yCovCompVecs[],uint64 D) const182 double Bolt::dotMultiCovCompVecs(const double xCovCompVecs[], const double yCovCompVecs[], 183 uint64 D) const { 184 double ret = 0; 185 for (uint64 d = 0; d < D; d++) 186 ret += dotCovCompVecs(xCovCompVecs+d*(Nstride+Cstride), yCovCompVecs+d*(Nstride+Cstride)); 187 return ret; 188 } 189 computeMultiProjNorm2s(double projNorm2s[],const double xCovCompVecs[],uint64 D,uint64 B) const190 void Bolt::computeMultiProjNorm2s(double projNorm2s[], const double xCovCompVecs[], uint64 D, 191 uint64 B) const { 192 for (uint64 b = 0; b < B; b++) { 193 projNorm2s[b] = 0; 194 for (uint64 d = 0; d < D; d++) 195 projNorm2s[b] += computeProjNorm2(xCovCompVecs + (b*D+d)*(Nstride+Cstride)); 196 } 197 } 198 199 /** 200 * TODO 201 * NOTE: VinvyMultiCovCompVecs is destroyed if Vegs[0] is non-identity! 202 */ updateVegs(vector<ublas::matrix<double>> & Vegs,double VinvyMultiCovCompVecs[],const uchar snpVCnums[],const vector<double> & vcXscale2s,int MCtrials) const203 ublas::vector <double> Bolt::updateVegs(vector < ublas::matrix <double> > &Vegs, 204 double VinvyMultiCovCompVecs[], const uchar snpVCnums[], 205 const vector <double> &vcXscale2s, int MCtrials) const { 206 207 int VCs = Vegs.size()-1; 208 uint64 D = Vegs[0].size1(); 209 vector < ublas::matrix <double> > deltaVegs(1+VCs, ublas::zero_matrix <double> (D, D)), 210 VegXscales(1+VCs); 211 for (int k = 0; k <= VCs; k++) 212 VegXscales[k] = (k == 0 ? 1 : sqrt(vcXscale2s[k])) * Vegs[k]; 213 vector <double> denoms(1+VCs); 214 denoms[0] = (double) (Nused-Cindep); 215 for (uint64 m = 0; m < M; m++) 216 if (projMaskSnps[m] && snpVCnums[m]) 217 denoms[snpVCnums[m]]++; 218 219 uint64 DxNC = D * (Nstride+Cstride); 220 221 uint64 B = MCtrials+1; 222 double *snpCovCompVecBlock = ALIGNED_MALLOC_DOUBLES(mBlockMultX * (Nstride+Cstride)); 223 double (*work)[4] = (double (*)[4]) ALIGNED_MALLOC(omp_get_max_threads() * 256*sizeof(*work)); 224 double *XtransVecsBlock = ALIGNED_MALLOC_DOUBLES(mBlockMultX * B * D); 225 226 // for each SNP block... 227 for (uint64 m0 = 0; m0 < M; m0 += mBlockMultX) { 228 229 // (1) load the SNPs in parallel into Xblock [Nstride+Cstride x block size] 230 uint64 mBlockMultXCrop = std::min(M, m0+mBlockMultX) - m0; 231 #pragma omp parallel for 232 for (uint64 mPlus = 0; mPlus < mBlockMultXCrop; mPlus++) { 233 uint64 m = m0+mPlus; 234 if (projMaskSnps[m] && snpVCnums[m]) // build snp vector + sign-flipped covar comps 235 buildMaskedSnpNegCovCompVec(snpCovCompVecBlock + mPlus * (Nstride+Cstride), m, 236 work + (omp_get_thread_num()<<8)); 237 else 238 memset(snpCovCompVecBlock + mPlus * (Nstride+Cstride), 0, 239 (Nstride+Cstride) * sizeof(snpCovCompVecBlock[0])); 240 } 241 242 // (2) multiply Xblock' [block size x Nstride+Cstride] * xMultiVecs [Nstride+Cstride x BxD] 243 // = XtransVecsBlock [block size x BxD] 244 // (note that Xblock' has NEG CCVecs while xVecs has POS CCVecs) 245 { 246 #ifdef MEASURE_DGEMM 247 //Timer timer; 248 unsigned long long tsc = Timer::rdtsc(); 249 #endif 250 char TRANSA_ = 'T'; 251 char TRANSB_ = 'N'; 252 int M_ = B * D; 253 int N_ = mBlockMultXCrop; 254 int K_ = Nstride+Cstride; 255 double ALPHA_ = 1.0; 256 const double *A_ = VinvyMultiCovCompVecs; 257 int LDA_ = Nstride+Cstride; 258 double *B_ = snpCovCompVecBlock; 259 int LDB_ = Nstride+Cstride; 260 double BETA_ = 0.0; 261 double *C_ = XtransVecsBlock; 262 int LDC_ = B * D; 263 264 DGEMM_MACRO(&TRANSA_, &TRANSB_, &M_, &N_, &K_, &ALPHA_, A_, &LDA_, B_, &LDB_, 265 &BETA_, C_, &LDC_); 266 #ifdef MEASURE_DGEMM 267 dgemmTicks += Timer::rdtsc() - tsc; 268 //dgemmTicks += timer.update_time(); 269 #endif 270 } 271 272 // (3) SNP-wise multiply XtransVecsBlock by appropriate Veg factor along with sqrt(Xscale2s) 273 for (uint64 mPlus = 0; mPlus < mBlockMultXCrop; mPlus++) { 274 uint64 m = m0+mPlus; 275 if (snpVCnums[m]) { // SNP belongs to a variance component 276 uint64 k = snpVCnums[m]; 277 for (uint64 b = 0; b < B; b++) { 278 uint64 coeffStart = mPlus*B*D + b*D; 279 rightMultiT(XtransVecsBlock + coeffStart, D, 1, VegXscales[k]); 280 double mult = (int) b == MCtrials ? 1 : -1.0/MCtrials; 281 for (uint64 di = 0; di < D; di++) 282 for (uint64 dj = 0; dj <= di; dj++) 283 deltaVegs[k](di, dj) += mult * XtransVecsBlock[coeffStart+di] * 284 XtransVecsBlock[coeffStart+dj]; 285 } 286 } 287 } 288 289 } 290 ALIGNED_FREE(XtransVecsBlock); 291 ALIGNED_FREE(work); 292 ALIGNED_FREE(snpCovCompVecBlock); 293 294 // compute env variance parameter matrix Vegs[0] updates 295 // directly use ***and destroy*** VinvyMultiCovCompVecs: need to rightMultiT by Vegs[0] 296 for (int t = 0; t <= MCtrials; t++) { 297 for (uint64 nc = 0; nc < Nstride+Cstride; nc++) // replace Vinvy with Ue-hats (env) 298 rightMultiT(VinvyMultiCovCompVecs + t*DxNC + nc, D, Nstride+Cstride, Vegs[0]); 299 double mult = t == MCtrials ? 1 : -1.0/MCtrials; 300 for (uint64 di = 0; di < D; di++) 301 for (uint64 dj = 0; dj <= di; dj++) 302 deltaVegs[0](di, dj) += mult * 303 dotCovCompVecs(VinvyMultiCovCompVecs + t*DxNC + di*(Nstride+Cstride), 304 VinvyMultiCovCompVecs + t*DxNC + dj*(Nstride+Cstride)); 305 } 306 307 for (int k = 0; k <= VCs; k++) { 308 for (uint64 di = 0; di < D; di++) 309 for (uint64 dj = 0; dj <= di; dj++) 310 deltaVegs[k](dj, di) = deltaVegs[k](di, dj); // symmetrize 311 Vegs[k] += 1/denoms[k] * deltaVegs[k]; 312 } 313 314 int numPars = (1+VCs) * D*(D+1)/2; 315 ublas::vector <double> grad(numPars); 316 int curPar = 0; 317 for (int k = 0; k <= VCs; k++) 318 for (uint64 di = 0; di < D; di++) 319 for (uint64 dj = 0; dj <= di; dj++) 320 grad(curPar++) = (di == dj ? 0.5 : 1.0) * deltaVegs[k](di, dj); 321 return grad; 322 } 323 computeMCgrad(const double VinvyMultiCovCompVecs[],uint64 D,const uchar snpVCnums[],int VCs,const vector<double> & vcXscale2s,int MCtrials) const324 ublas::vector <double> Bolt::computeMCgrad(const double VinvyMultiCovCompVecs[], uint64 D, 325 const uchar snpVCnums[], int VCs, 326 const vector <double> &vcXscale2s, int MCtrials) 327 const { 328 329 int numPars = (1+VCs) * D*(D+1)/2; 330 ublas::vector <double> grad = ublas::zero_vector <double> (numPars); 331 uint64 DxNC = D * (Nstride+Cstride); 332 uint64 B = MCtrials+1; 333 double *snpCovCompVecBlock = ALIGNED_MALLOC_DOUBLES(mBlockMultX * (Nstride+Cstride)); 334 double (*work)[4] = (double (*)[4]) ALIGNED_MALLOC(omp_get_max_threads() * 256*sizeof(*work)); 335 double *XtransVecsBlock = ALIGNED_MALLOC_DOUBLES(mBlockMultX * B * D); 336 337 // for each SNP block... 338 for (uint64 m0 = 0; m0 < M; m0 += mBlockMultX) { 339 340 // (1) load the SNPs in parallel into Xblock [Nstride+Cstride x block size] 341 uint64 mBlockMultXCrop = std::min(M, m0+mBlockMultX) - m0; 342 #pragma omp parallel for 343 for (uint64 mPlus = 0; mPlus < mBlockMultXCrop; mPlus++) { 344 uint64 m = m0+mPlus; 345 if (projMaskSnps[m] && snpVCnums[m]) // build snp vector + sign-flipped covar comps 346 buildMaskedSnpNegCovCompVec(snpCovCompVecBlock + mPlus * (Nstride+Cstride), m, 347 work + (omp_get_thread_num()<<8)); 348 else 349 memset(snpCovCompVecBlock + mPlus * (Nstride+Cstride), 0, 350 (Nstride+Cstride) * sizeof(snpCovCompVecBlock[0])); 351 } 352 353 // (2) multiply Xblock' [block size x Nstride+Cstride] * xMultiVecs [Nstride+Cstride x BxD] 354 // = XtransVecsBlock [block size x BxD] 355 // (note that Xblock' has NEG CCVecs while xVecs has POS CCVecs) 356 { 357 #ifdef MEASURE_DGEMM 358 //Timer timer; 359 unsigned long long tsc = Timer::rdtsc(); 360 #endif 361 char TRANSA_ = 'T'; 362 char TRANSB_ = 'N'; 363 int M_ = B * D; 364 int N_ = mBlockMultXCrop; 365 int K_ = Nstride+Cstride; 366 double ALPHA_ = 1.0; 367 const double *A_ = VinvyMultiCovCompVecs; 368 int LDA_ = Nstride+Cstride; 369 double *B_ = snpCovCompVecBlock; 370 int LDB_ = Nstride+Cstride; 371 double BETA_ = 0.0; 372 double *C_ = XtransVecsBlock; 373 int LDC_ = B * D; 374 375 DGEMM_MACRO(&TRANSA_, &TRANSB_, &M_, &N_, &K_, &ALPHA_, A_, &LDA_, B_, &LDB_, 376 &BETA_, C_, &LDC_); 377 #ifdef MEASURE_DGEMM 378 dgemmTicks += Timer::rdtsc() - tsc; 379 //dgemmTicks += timer.update_time(); 380 #endif 381 } 382 383 // (3) SNP-wise multiply XtransVecsBlock by appropriate Veg factor along with sqrt(Xscale2s) 384 for (uint64 mPlus = 0; mPlus < mBlockMultXCrop; mPlus++) { 385 uint64 m = m0+mPlus; 386 if (snpVCnums[m]) { // SNP belongs to a variance component 387 uint64 k = snpVCnums[m]; 388 for (uint64 b = 0; b < B; b++) { 389 uint64 coeffStart = mPlus*B*D + b*D; 390 double mult = ((int) b == MCtrials ? 1 : -1.0/MCtrials) * vcXscale2s[k]; 391 int curPar = k*D*(D+1)/2; 392 for (uint64 di = 0; di < D; di++) 393 for (uint64 dj = 0; dj <= di; dj++) 394 grad(curPar++) += (di == dj ? 0.5 : 1.0) * mult * XtransVecsBlock[coeffStart+di] * 395 XtransVecsBlock[coeffStart+dj]; 396 } 397 } 398 } 399 400 } 401 ALIGNED_FREE(XtransVecsBlock); 402 ALIGNED_FREE(work); 403 ALIGNED_FREE(snpCovCompVecBlock); 404 405 // compute grad components for env variance parameters 406 for (int t = 0; t <= MCtrials; t++) { 407 double mult = t == MCtrials ? 1 : -1.0/MCtrials; 408 int curPar = 0; 409 for (uint64 di = 0; di < D; di++) 410 for (uint64 dj = 0; dj <= di; dj++) 411 grad(curPar++) += (di == dj ? 0.5 : 1.0) * mult * 412 dotCovCompVecs(VinvyMultiCovCompVecs + t*DxNC + di*(Nstride+Cstride), 413 VinvyMultiCovCompVecs + t*DxNC + dj*(Nstride+Cstride)); 414 } 415 416 return grad; 417 } 418 419 /** 420 * computes Vmulti * X for a batch of ((N+C)D)-vectors X (each a stack of D (N+C)-vectors), where 421 * Vmulti = kron(VegXscale2s[0], I) + SUM_k=1^VCs kron(VegXscale2s[k], X_k*X_k') 422 * and VegXscale2s = Veg .* [1 vcXscale2s[1]..vcXscale2s[VCs]] 423 * 424 * VmultiCovCompVecs: (out) B x D x (Nstride+Cstride) 425 * xMultiCovCompVecs: (in) B x D x (Nstride+Cstride) 426 * VegXscale2s: (in) (1+VCs) x DxD combined scale factors for easy application 427 */ multVmulti(double VmultiCovCompVecs[],const double xMultiCovCompVecs[],const uchar snpVCnums[],const vector<ublas::matrix<double>> & VegXscale2s,uint64 B) const428 void Bolt::multVmulti(double VmultiCovCompVecs[], const double xMultiCovCompVecs[], 429 const uchar snpVCnums[], 430 const vector <ublas::matrix <double> > &VegXscale2s, uint64 B) const { 431 432 // note: algorithm is more efficient than from original multH: 433 // for each block, it performs multXtrans, scales the results, and immediately does multX 434 // this way, SNPs only need to be loaded once, and all VCs can be done at once 435 436 uint64 D = VegXscale2s[0].size1(); 437 uint64 DxNC = D * (Nstride+Cstride); 438 memcpy(VmultiCovCompVecs, xMultiCovCompVecs, B * DxNC * sizeof(VmultiCovCompVecs[0])); 439 // initialize each answer vec Vmulti to X (NC x D) * Ve (D x D) for environment/noise VC 440 for (uint64 b = 0; b < B; b++) 441 for (uint64 nc = 0; nc < Nstride+Cstride; nc++) 442 rightMultiT(VmultiCovCompVecs + b*DxNC + nc, D, Nstride+Cstride, VegXscale2s[0]); 443 444 double *snpCovCompVecBlock = ALIGNED_MALLOC_DOUBLES(mBlockMultX * (Nstride+Cstride)); 445 double (*work)[4] = (double (*)[4]) ALIGNED_MALLOC(omp_get_max_threads() * 256*sizeof(*work)); 446 double *XtransVecsBlock = ALIGNED_MALLOC_DOUBLES(mBlockMultX * B * D); 447 448 // for each SNP block... 449 for (uint64 m0 = 0; m0 < M; m0 += mBlockMultX) { 450 451 // (1) load the SNPs in parallel into Xblock [Nstride+Cstride x block size] 452 uint64 mBlockMultXCrop = std::min(M, m0+mBlockMultX) - m0; 453 #pragma omp parallel for 454 for (uint64 mPlus = 0; mPlus < mBlockMultXCrop; mPlus++) { 455 uint64 m = m0+mPlus; 456 if (projMaskSnps[m] && snpVCnums[m]) // build snp vector + sign-flipped covar comps 457 buildMaskedSnpNegCovCompVec(snpCovCompVecBlock + mPlus * (Nstride+Cstride), m, 458 work + (omp_get_thread_num()<<8)); 459 else 460 memset(snpCovCompVecBlock + mPlus * (Nstride+Cstride), 0, 461 (Nstride+Cstride) * sizeof(snpCovCompVecBlock[0])); 462 } 463 464 // (2) multiply Xblock' [block size x Nstride+Cstride] * xMultiVecs [Nstride+Cstride x BxD] 465 // = XtransVecsBlock [block size x BxD] 466 // (note that Xblock' has NEG CCVecs while xVecs has POS CCVecs) 467 { 468 #ifdef MEASURE_DGEMM 469 //Timer timer; 470 unsigned long long tsc = Timer::rdtsc(); 471 #endif 472 char TRANSA_ = 'T'; 473 char TRANSB_ = 'N'; 474 int M_ = B * D; 475 int N_ = mBlockMultXCrop; 476 int K_ = Nstride+Cstride; 477 double ALPHA_ = 1.0; 478 const double *A_ = xMultiCovCompVecs; 479 int LDA_ = Nstride+Cstride; 480 double *B_ = snpCovCompVecBlock; 481 int LDB_ = Nstride+Cstride; 482 double BETA_ = 0.0; 483 double *C_ = XtransVecsBlock; 484 int LDC_ = B * D; 485 486 DGEMM_MACRO(&TRANSA_, &TRANSB_, &M_, &N_, &K_, &ALPHA_, A_, &LDA_, B_, &LDB_, 487 &BETA_, C_, &LDC_); 488 #ifdef MEASURE_DGEMM 489 dgemmTicks += Timer::rdtsc() - tsc; 490 //dgemmTicks += timer.update_time(); 491 #endif 492 } 493 494 // (3) SNP-wise multiply XtransVecsBlock by appropriate VegXscale2 factors 495 for (uint64 mPlus = 0; mPlus < mBlockMultXCrop; mPlus++) { 496 uint64 m = m0+mPlus; 497 if (snpVCnums[m]) { // SNP belongs to a variance component 498 uint64 k = snpVCnums[m]; 499 for (uint64 b = 0; b < B; b++) 500 rightMultiT(XtransVecsBlock + mPlus*B*D + b*D, D, 1, VegXscale2s[k]); 501 } 502 } 503 504 // note that Xblock was loaded with NEG CCVecs; now we need to sign-flip CCVecs 505 for (uint64 mPlus = 0; mPlus < mBlockMultXCrop; mPlus++) { 506 uint64 m = m0+mPlus; 507 if (projMaskSnps[m] && snpVCnums[m]) // sign-flip covar comps 508 for (uint64 c = 0; c < Cstride; c++) 509 snpCovCompVecBlock[mPlus * (Nstride+Cstride) + Nstride + c] *= -1; 510 } 511 512 // (4) multiply Xblock [NCstride x block size] * scaled XtransVecsBlock [block size x B] 513 // = block's contribution to SUM_k (scale(k) X_k X_k') * xVecs [NCstride x B] 514 // directly accumulate results in answer Wmult [NCstride x B] 515 { 516 #ifdef MEASURE_DGEMM 517 //Timer timer; 518 unsigned long long tsc = Timer::rdtsc(); 519 #endif 520 char TRANSA_ = 'N'; 521 char TRANSB_ = 'T'; 522 int M_ = Nstride+Cstride; 523 int N_ = B * D; 524 int K_ = mBlockMultXCrop; 525 double ALPHA_ = 1.0; 526 double *A_ = snpCovCompVecBlock; 527 int LDA_ = Nstride+Cstride; 528 const double *B_ = XtransVecsBlock; 529 int LDB_ = B * D; 530 double BETA_ = 1.0; 531 double *C_ = VmultiCovCompVecs; 532 int LDC_ = Nstride+Cstride; 533 DGEMM_MACRO(&TRANSA_, &TRANSB_, &M_, &N_, &K_, &ALPHA_, A_, &LDA_, B_, &LDB_, 534 &BETA_, C_, &LDC_); 535 #ifdef MEASURE_DGEMM 536 dgemmTicks += Timer::rdtsc() - tsc; 537 //dgemmTicks += timer.update_time(); 538 #endif 539 } 540 541 } 542 ALIGNED_FREE(XtransVecsBlock); 543 ALIGNED_FREE(work); 544 ALIGNED_FREE(snpCovCompVecBlock); 545 } 546 547 /** 548 * solves a batch of B equations 549 * Vmulti * vec(X) = b 550 * where 551 * Vmulti = kron(Vegs[0], I) + SUM_k=1^VCs kron(Vegs[k], vcXscale2s[k] * X_k*X_k') 552 * and vcXscale2s[k] = 1/("M_k" = sum(Xnorm2s[snpVCnums==k])/(Nused-Cindep)) 553 * (and as usual, projections of x, b, and columns of X are implicitly represented via covComps) 554 * 555 * xMultiCovCompVecs: (in/out) B x D x Nstride+Cstride 556 * useStartVecs: use input xMultiCovCompVecs as initial guess for iteration 557 * bMultiCovCompVecs: (in) B x D x Nstride+Cstride 558 * Vegs: (in) (1+VCs) x DxD covariance parameters 559 */ conjGradSolveVmulti(double xMultiCovCompVecs[],bool useStartVecs,const double bMultiCovCompVecs[],uint64 B,const uchar snpVCnums[],const vector<double> & vcXscale2s,const vector<ublas::matrix<double>> & Vegs,int maxIters,double CGtol) const560 void Bolt::conjGradSolveVmulti(double xMultiCovCompVecs[], bool useStartVecs, 561 const double bMultiCovCompVecs[], uint64 B, 562 const uchar snpVCnums[], const vector <double> &vcXscale2s, 563 const vector < ublas::matrix <double> > &Vegs, int maxIters, 564 double CGtol) const { 565 566 int VCs = Vegs.size()-1; 567 uint64 D = Vegs[0].size1(); 568 569 // combine Vegs and vcXscale2s into VegXscale2s 570 vector < ublas::matrix <double> > VegXscale2s(1+VCs); 571 VegXscale2s[0] = Vegs[0]; 572 for (int k = 1; k <= VCs; k++) 573 VegXscale2s[k] = vcXscale2s[k] * Vegs[k]; 574 575 #ifdef VERBOSE 576 Timer timer; 577 cout << " Batch-solving " << B << " systems of equations using conjugate gradient iteration" 578 << endl; 579 #endif 580 #ifdef MEASURE_DGEMM 581 unsigned long long tscStart = Timer::rdtsc(); 582 dgemmTicks = 0; 583 #endif 584 585 const uint64 BxDxNC = B * D * (Nstride+Cstride); 586 587 vector <double> r2orig(B), r2olds(B), r2news(B); 588 computeMultiProjNorm2s(&r2orig[0], bMultiCovCompVecs, D, B); 589 590 double *rMultiCovCompVecs = ALIGNED_MALLOC_DOUBLES(BxDxNC); 591 double *VmultiCovCompVecs = ALIGNED_MALLOC_DOUBLES(BxDxNC); 592 if (useStartVecs) { 593 multVmulti(VmultiCovCompVecs, xMultiCovCompVecs, snpVCnums, VegXscale2s, B); // V*x 594 for (uint64 bdnc = 0; bdnc < BxDxNC; bdnc++) 595 rMultiCovCompVecs[bdnc] = bMultiCovCompVecs[bdnc] - VmultiCovCompVecs[bdnc]; // r=b-V*x 596 computeMultiProjNorm2s(&r2olds[0], rMultiCovCompVecs, D, B); // rsold=r'*r 597 } 598 else { // starting at x=0 599 memset(xMultiCovCompVecs, 0, BxDxNC * sizeof(rMultiCovCompVecs[0])); 600 memcpy(rMultiCovCompVecs, bMultiCovCompVecs, BxDxNC * sizeof(rMultiCovCompVecs[0])); 601 r2olds = r2orig; 602 } 603 604 double *pMultiCovCompVecs = ALIGNED_MALLOC_DOUBLES(BxDxNC); 605 memcpy(pMultiCovCompVecs, rMultiCovCompVecs, BxDxNC * sizeof(pMultiCovCompVecs[0])); // p=r 606 607 for (int iter = 0; iter < maxIters; iter++) { 608 multVmulti(VmultiCovCompVecs, pMultiCovCompVecs, snpVCnums, VegXscale2s, B); // V*p 609 610 for (uint64 bdnc = 0, b = 0; b < B; b++) { 611 double *p = pMultiCovCompVecs + b * D*(Nstride+Cstride); 612 double *Vp = VmultiCovCompVecs + b * D*(Nstride+Cstride); 613 614 // alpha=rsold/(p'*Ap) 615 double alpha = r2olds[b] / dotMultiCovCompVecs(p, Vp, D); 616 617 for (uint64 dnc = 0; dnc < D*(Nstride+Cstride); dnc++, bdnc++) { 618 xMultiCovCompVecs[bdnc] += alpha * pMultiCovCompVecs[bdnc]; //x=x+alpha*p 619 rMultiCovCompVecs[bdnc] -= alpha * VmultiCovCompVecs[bdnc]; //r=r-alpha*Ap 620 } 621 } 622 623 computeMultiProjNorm2s(&r2news[0], rMultiCovCompVecs, D, B); // rsnew=r'*r 624 625 #ifdef VERBOSE 626 double min_rRatio = 1e9, max_rRatio = 0; 627 for (uint64 b = 0; b < B; b++) { 628 double rRatio = sqrt(r2news[b] / r2orig[b]); 629 min_rRatio = std::min(rRatio, min_rRatio); 630 max_rRatio = std::max(rRatio, max_rRatio); 631 } 632 633 vector <double> resNorm2s(B); 634 computeMultiProjNorm2s(&resNorm2s[0], xMultiCovCompVecs, D, B); 635 636 printf(" iter %d: time=%.2f rNorms/orig: (%.1g,%.1g) res2s: %g..%g\n", 637 iter+1, timer.update_time(), min_rRatio, max_rRatio, resNorm2s[0], resNorm2s[B-1]); 638 fflush(stdout); 639 #endif 640 641 // check convergence 642 bool converged = true; 643 for (uint64 b = 0; b < B; b++) 644 if (sqrt(r2news[b] / r2orig[b]) > CGtol) 645 converged = false; 646 if (converged) { 647 cout << " Converged at iter " << iter+1 << ": rNorms/orig all < CGtol=" << CGtol << endl; 648 break; 649 } 650 651 for (uint64 bdnc = 0, b = 0; b < B; b++) { 652 double r2ratio = r2news[b] / r2olds[b]; 653 for (uint64 dnc = 0; dnc < D*(Nstride+Cstride); dnc++, bdnc++) // p=r+rsnew/rsold*p 654 pMultiCovCompVecs[bdnc] = rMultiCovCompVecs[bdnc] + r2ratio * pMultiCovCompVecs[bdnc]; 655 } 656 657 r2olds = r2news; // rsold=rsnew 658 } 659 660 ALIGNED_FREE(pMultiCovCompVecs); 661 ALIGNED_FREE(VmultiCovCompVecs); 662 ALIGNED_FREE(rMultiCovCompVecs); 663 #ifdef MEASURE_DGEMM 664 double dgemmPct = 100 * dgemmTicks / (double) (Timer::rdtsc()-tscStart); 665 printf(" Time breakdown: dgemm = %.1f%%, memory/overhead = %.1f%%\n", dgemmPct, 100-dgemmPct); 666 fflush(stdout); 667 #endif 668 } 669 670 /** 671 * creates yRandsData (MCtrials+1) MultiCovCompVecs as linear combinations of yEnvGenUnscaled 672 * viewing yEnvGenUnscaled as (1+VCs) blocks of size (MCtrials+1) x D x (Nstride+Cstride), 673 * yRandsData = lin. comb. of these blocks, where each NCxD chunk is right-mult by chol(Vegs)' 674 * (note that the last rep is the data and just gets yEnvGenUnscaled(0,MCtrials,:) copied in) 675 * 676 * yRandsDataMultiCovCompVecs: (out) (MCtrials+1) x D x (Nstride+Cstride) 677 * yEnvGenUnscaledMultiCovCompVecs: (in) (1+VCs) x (MCtrials+1) x D x (Nstride+Cstride) 678 * 679 */ combineEnvGenMultiCovCompVecs(double yRandsDataMultiCovCompVecs[],const double yEnvGenUnscaledMultiCovCompVecs[],const vector<ublas::matrix<double>> & Vegs,int MCtrials) const680 void Bolt::combineEnvGenMultiCovCompVecs(double yRandsDataMultiCovCompVecs[], 681 const double yEnvGenUnscaledMultiCovCompVecs[], 682 const vector < ublas::matrix <double> > &Vegs, 683 int MCtrials) const { 684 685 int VCs = Vegs.size()-1; 686 uint64 D = Vegs[0].size1(); 687 vector < ublas::matrix <double> > cholVegs(1+VCs); 688 for (int k = 0; k <= VCs; k++) 689 cholVegs[k] = MatrixUtils::chol(Vegs[k]); 690 uint64 DxNC = D * (Nstride+Cstride); 691 uint64 blockSize = (MCtrials+1) * DxNC; 692 memcpy(yRandsDataMultiCovCompVecs, yEnvGenUnscaledMultiCovCompVecs, // copy in env terms 693 blockSize * sizeof(yRandsDataMultiCovCompVecs[0])); 694 vector <double> tmpD(D); 695 for (int t = 0; t < MCtrials; t++) 696 for (uint64 nc = 0; nc < Nstride+Cstride; nc++) { 697 // apply chol' to env terms currently in yRands (but not data rep) 698 rightMultiT(yRandsDataMultiCovCompVecs + t*DxNC + nc, D, Nstride+Cstride, cholVegs[0]); 699 // apply chol' to gen terms and accumulate in yRands 700 for (int k = 1; k <= VCs; k++) { 701 const double *yGenUnscaledMultiCCVecs_k = yEnvGenUnscaledMultiCovCompVecs + k*blockSize; 702 for (uint64 d = 0; d < D; d++) 703 tmpD[d] = yGenUnscaledMultiCCVecs_k[t*DxNC + d*(Nstride+Cstride) + nc]; 704 rightMultiT(&tmpD[0], D, 1, cholVegs[k]); 705 for (uint64 d = 0; d < D; d++) 706 yRandsDataMultiCovCompVecs[t*DxNC + d*(Nstride+Cstride) + nc] += tmpD[d]; 707 } 708 } 709 } 710 711 /** 712 * TODO: update comments (basically the same as single-trait version but with MCtrials x D rands) 713 * 714 * generates 1+VCs blocks of MCtrials+1 (rand+data) component vecs for building MCreml phenotypes 715 * later, blocks will be combined with coeffs 1, rho_1, ..., rho_VCs 716 * components are pre-scaled only with sqrt(vcXscale2s) (~1/sqrt(M_k)), putting Xs on same scale 717 * 718 * yEnvGenUnscaledCovCompVecs: (out) (1+VCs) x (MCtrials+1) x (Nstride+Cstride) 719 * env gen rands data 720 * data rep: y 000 721 * 722 * - projecting out covariates: implicitly done by covComps 723 * - applying maskIndivs: automatically done to snps (=> Gen component) by buildMaskedSnpVector 724 * needs to be applied to EnvUnscaled component 725 * - data layout: 1+VCs batches of {MCtrials rand reps + 1 data rep} 726 * - first batch holds Env components 727 * - remaining VCs batches hold Gen components 728 * - in each batch: 729 * - 0..MCtrials-1: Env, Gen components of random phenotypes 730 * - MCtrials: Env = pheno from data; Gen = 0 731 * 732 * pheno: (in) real phenotype (data rep), possibly of size N or zero-filled beyond (no covComps) 733 * snpVCnums: (in) M-vector of assignments of SNPs to VCs (0 -> ignore; 1..VCs -> var comps) 734 * VCs: (in) number of non-identity VCs 735 * vcXscale2s: (in) (VCs+1)-vector of squared scale factors that normalize X's (ignore 0th entry) 736 * 737 * return: phenotype normalizations and correlations 738 */ genUnscaledMultiCovCompVecs(double yEnvGenUnscaledMultiCovCompVecs[],const vector<vector<double>> & phenos,const uchar snpVCnums[],int VCs,const vector<double> & vcXscale2s,int MCtrials,int seed) const739 ublas::matrix <double> Bolt::genUnscaledMultiCovCompVecs 740 (double yEnvGenUnscaledMultiCovCompVecs[], const vector < vector <double> > &phenos, 741 const uchar snpVCnums[], int VCs, const vector <double> &vcXscale2s, int MCtrials, int seed) 742 const { 743 744 boost::mt19937 rng(seed+54321); 745 boost::variate_generator<boost::mt19937&, boost::normal_distribution<> > 746 randn(rng, boost::normal_distribution<>(0.0, 1.0)); 747 748 uint64 D = phenos.size(); 749 uint64 DxNC = D * (Nstride+Cstride); 750 751 // return matrix: phenotype normalizations and corrs 752 ublas::matrix <double> phenoNormsCorrs = ublas::zero_matrix <double> (D, D); 753 754 // zero-initialize all vectors 755 memset(yEnvGenUnscaledMultiCovCompVecs, 0, 756 (1+VCs) * (MCtrials+1) * DxNC * sizeof(yEnvGenUnscaledMultiCovCompVecs[0])); 757 758 // put pheno in Env (first=0 of 1+VCs) block, MCtrials (last=MCtrials of MCtrials+1) DxNC vec 759 double *phenoCovCompVecs = yEnvGenUnscaledMultiCovCompVecs + MCtrials * DxNC; 760 for (uint64 d = 0; d < D; d++) { 761 double *phenoCovCompVecs_d = phenoCovCompVecs + d*(Nstride+Cstride); 762 memcpy(phenoCovCompVecs_d, &phenos[d][0], phenos[d].size() * sizeof(phenoCovCompVecs[0])); 763 covBasis.applyMaskIndivs(phenoCovCompVecs_d); 764 covBasis.computeCindepComponents(phenoCovCompVecs_d + Nstride, phenoCovCompVecs_d); 765 // rescale phenos to have norm approximately 1 766 double scale = sqrt((Nused-Cindep) / computeProjNorm2(phenoCovCompVecs_d)); 767 phenoNormsCorrs(d, d) = 1/scale; 768 for (uint nc = 0; nc < Nstride+Cstride; nc++) 769 phenoCovCompVecs_d[nc] *= scale; 770 } 771 for (uint64 i = 0; i < D; i++) 772 for (uint64 j = i+1; j < D; j++) 773 phenoNormsCorrs(i, j) = phenoNormsCorrs(j, i) = 774 dotCovCompVecs(phenoCovCompVecs + i*(Nstride+Cstride), 775 phenoCovCompVecs + j*(Nstride+Cstride)) / (Nused-Cindep); 776 777 // generate yGen: VCs x (MCtrials+1) x DxNC with rand betas generated on-the-fly! 778 double *snpCovCompVec = ALIGNED_MALLOC_DOUBLES(Nstride+Cstride); 779 double (*work)[4] = (double (*)[4]) ALIGNED_MALLOC(256*sizeof(*work)); 780 double *betas_m = ALIGNED_MALLOC_DOUBLES(MCtrials*D); 781 for (uint64 m = 0; m < M; m++) 782 if (projMaskSnps[m] && snpVCnums[m]) { 783 // loop through SNPs; load just 1 (Nstride+Cstride) SNP at a time 784 buildMaskedSnpCovCompVec(snpCovCompVec, m, work); 785 for (int t = 0; t < MCtrials * (int) D; t++) // generate MCtrials*D random betas 786 betas_m[t] = randn(); 787 // DGER to update MCtrials x D x (Nstride+Cstride) block *for appropriate VC* 788 uint64 k = snpVCnums[m]; 789 double *yGenBlock = yEnvGenUnscaledMultiCovCompVecs + k * (MCtrials+1) * DxNC; 790 // update yGenBlock: add betas_m * snpCovCompVec (DGER) 791 { 792 int M_ = Nstride+Cstride; 793 int N_ = MCtrials*D; 794 double ALPHA_ = sqrt(vcXscale2s[k]); // normalize 1/sqrt("M"=sum(Xnorm2s)/(Nused-Cindep)) 795 double *X_ = snpCovCompVec; 796 int INCX_ = 1; 797 double *Y_ = betas_m; 798 int INCY_ = 1; 799 double *A_ = yGenBlock; 800 int LDA_ = Nstride+Cstride; 801 DGER_MACRO(&M_, &N_, &ALPHA_, X_, &INCX_, Y_, &INCY_, A_, &LDA_); 802 } 803 } 804 ALIGNED_FREE(betas_m); 805 ALIGNED_FREE(work); 806 ALIGNED_FREE(snpCovCompVec); 807 808 // generate yEnv: first MCtrials x (Nstride+Cstride) block of output array 809 // (last 1 x (Nstride+Cstride) is real data phenotype) 810 for (int t = 0; t < MCtrials; t++) { 811 // EnvUnscaled: epsCovCompVec <- N randn, after the following processing... 812 // - mask out maskIndivs (=> norm2 ~ Nused) 813 // - compute covComps: implicitly project out covars (=> norm2-SUM(comps2) ~ Nused-Cindep) 814 for (uint64 d = 0; d < D; d++) { 815 double *randnEpsCovCompVec = yEnvGenUnscaledMultiCovCompVecs + t*DxNC+d*(Nstride+Cstride); 816 for (uint64 n = 0; n < Nstride; n++) 817 if (maskIndivs[n]) 818 randnEpsCovCompVec[n] = randn(); 819 // no need to zero out components after Cindep: already 0-initialized 820 covBasis.computeCindepComponents(randnEpsCovCompVec + Nstride, randnEpsCovCompVec); 821 } 822 } 823 824 return phenoNormsCorrs; 825 } 826 updateVegsAI(vector<ublas::matrix<double>> & Vegs,const uchar snpVCnums[],const vector<double> & vcXscale2s,const double yRandsDataMultiCovCompVecs[],int MCtrials,int CGmaxIters,double CGtol) const827 void Bolt::updateVegsAI(vector < ublas::matrix <double> > &Vegs, const uchar snpVCnums[], 828 const vector <double> &vcXscale2s, 829 const double yRandsDataMultiCovCompVecs[], int MCtrials, int CGmaxIters, 830 double CGtol) const { 831 832 int VCs = Vegs.size()-1; 833 uint64 D = Vegs[0].size1(); 834 uint64 DxNC = D * (Nstride+Cstride); 835 836 double *VinvyRandsDataMultiCovCompVecs = ALIGNED_MALLOC_DOUBLES((MCtrials+1) * DxNC); 837 838 conjGradSolveVmulti(VinvyRandsDataMultiCovCompVecs, false, yRandsDataMultiCovCompVecs, 839 MCtrials+1, snpVCnums, vcXscale2s, Vegs, CGmaxIters, CGtol); 840 841 // compute gradient 842 vector < ublas::matrix <double> > identityVegs(1+VCs, ublas::identity_matrix <double> (D)); 843 ublas::vector <double> grad = updateVegs(identityVegs, VinvyRandsDataMultiCovCompVecs, 844 snpVCnums, vcXscale2s, MCtrials); 845 cout << "grad" << grad << endl; 846 847 // compute AI matrix 848 849 const double *VinvyMultiCovCompVecs = VinvyRandsDataMultiCovCompVecs + MCtrials * DxNC; 850 851 double *ThetasVinvyMultiCovCompVecs = ALIGNED_MALLOC_DOUBLES(VCs * DxNC); 852 853 multThetaMinusIs(ThetasVinvyMultiCovCompVecs, VinvyMultiCovCompVecs, snpVCnums, vcXscale2s, 854 D, 0); 855 856 int numPars = (1+VCs) * D*(D+1)/2; 857 double *dVdparsVinvyMultiCovCompVecs = ALIGNED_MALLOC_DOUBLES(numPars * DxNC); 858 859 memset(dVdparsVinvyMultiCovCompVecs, 0, numPars*DxNC*sizeof(dVdparsVinvyMultiCovCompVecs[0])); 860 int curPar = 0; 861 for (int k = 0; k <= VCs; k++) { // populate dVdparsVinvyMultiCovCompVecs with computed vecs 862 const double *ThetasVinvyMultiCCVecs_k = k == 0 ? VinvyMultiCovCompVecs : 863 ThetasVinvyMultiCovCompVecs + (k-1) * DxNC; 864 for (uint64 di = 0; di < D; di++) 865 for (uint64 dj = 0; dj <= di; dj++) { 866 memcpy(dVdparsVinvyMultiCovCompVecs + curPar * DxNC + dj*(Nstride+Cstride), 867 ThetasVinvyMultiCCVecs_k + di*(Nstride+Cstride), 868 (Nstride+Cstride)*sizeof(ThetasVinvyMultiCCVecs_k[0])); 869 if (di != dj) 870 memcpy(dVdparsVinvyMultiCovCompVecs + curPar * DxNC + di*(Nstride+Cstride), 871 ThetasVinvyMultiCCVecs_k + dj*(Nstride+Cstride), 872 (Nstride+Cstride)*sizeof(ThetasVinvyMultiCCVecs_k[0])); 873 curPar++; 874 } 875 } 876 877 double *VinvdVdparsVinvyMultiCovCompVecs = ALIGNED_MALLOC_DOUBLES(numPars * DxNC); 878 879 conjGradSolveVmulti(VinvdVdparsVinvyMultiCovCompVecs, false, dVdparsVinvyMultiCovCompVecs, 880 numPars, snpVCnums, vcXscale2s, Vegs, CGmaxIters, CGtol); 881 882 ublas::matrix <double> AI = ublas::zero_matrix <double> (numPars, numPars); 883 884 for (int pi = 0; pi < numPars; pi++) 885 for (int pj = 0; pj < numPars; pj++) 886 AI(pi, pj) = pj < pi ? AI(pj, pi) : 887 -0.5 * dotMultiCovCompVecs(VinvdVdparsVinvyMultiCovCompVecs + pi * DxNC, 888 dVdparsVinvyMultiCovCompVecs + pj * DxNC, D); 889 cout << "AI" << AI << endl; 890 891 ublas::vector <double> step = -MatrixUtils::linSolve(AI, grad); 892 cout << "step" << step << endl; 893 894 curPar = 0; 895 for (int k = 0; k <= VCs; k++) 896 for (uint64 di = 0; di < D; di++) 897 for (uint64 dj = 0; dj <= di; dj++) { 898 Vegs[k](di, dj) += step(curPar++); 899 Vegs[k](dj, di) = Vegs[k](di, dj); 900 } 901 902 ALIGNED_FREE(VinvdVdparsVinvyMultiCovCompVecs); 903 ALIGNED_FREE(dVdparsVinvyMultiCovCompVecs); 904 ALIGNED_FREE(ThetasVinvyMultiCovCompVecs); 905 ALIGNED_FREE(VinvyRandsDataMultiCovCompVecs); 906 } 907 computeAI(const vector<ublas::matrix<double>> & Vegs,const double VinvyMultiCovCompVecs[],const uchar snpVCnums[],const vector<double> & vcXscale2s,int CGmaxIters,double CGtol) const908 ublas::matrix <double> Bolt::computeAI(const vector < ublas::matrix <double> > &Vegs, 909 const double VinvyMultiCovCompVecs[], 910 const uchar snpVCnums[], 911 const vector <double> &vcXscale2s, int CGmaxIters, 912 double CGtol) const { 913 914 int VCs = Vegs.size()-1; 915 uint64 D = Vegs[0].size1(); 916 uint64 DxNC = D * (Nstride+Cstride); 917 918 double *ThetasVinvyMultiCovCompVecs = ALIGNED_MALLOC_DOUBLES(VCs * DxNC); 919 920 multThetaMinusIs(ThetasVinvyMultiCovCompVecs, VinvyMultiCovCompVecs, snpVCnums, vcXscale2s, 921 D, 0); 922 923 int numPars = (1+VCs) * D*(D+1)/2; 924 double *dVdparsVinvyMultiCovCompVecs = ALIGNED_MALLOC_DOUBLES(numPars * DxNC); 925 926 memset(dVdparsVinvyMultiCovCompVecs, 0, numPars*DxNC*sizeof(dVdparsVinvyMultiCovCompVecs[0])); 927 int curPar = 0; 928 for (int k = 0; k <= VCs; k++) { // populate dVdparsVinvyMultiCovCompVecs with computed vecs 929 const double *ThetasVinvyMultiCCVecs_k = k == 0 ? VinvyMultiCovCompVecs : 930 ThetasVinvyMultiCovCompVecs + (k-1) * DxNC; 931 for (uint64 di = 0; di < D; di++) 932 for (uint64 dj = 0; dj <= di; dj++) { 933 memcpy(dVdparsVinvyMultiCovCompVecs + curPar * DxNC + dj*(Nstride+Cstride), 934 ThetasVinvyMultiCCVecs_k + di*(Nstride+Cstride), 935 (Nstride+Cstride)*sizeof(ThetasVinvyMultiCCVecs_k[0])); 936 if (di != dj) 937 memcpy(dVdparsVinvyMultiCovCompVecs + curPar * DxNC + di*(Nstride+Cstride), 938 ThetasVinvyMultiCCVecs_k + dj*(Nstride+Cstride), 939 (Nstride+Cstride)*sizeof(ThetasVinvyMultiCCVecs_k[0])); 940 curPar++; 941 } 942 } 943 944 double *VinvdVdparsVinvyMultiCovCompVecs = ALIGNED_MALLOC_DOUBLES(numPars * DxNC); 945 946 conjGradSolveVmulti(VinvdVdparsVinvyMultiCovCompVecs, false, dVdparsVinvyMultiCovCompVecs, 947 numPars, snpVCnums, vcXscale2s, Vegs, CGmaxIters, CGtol); 948 949 ublas::matrix <double> AI = ublas::zero_matrix <double> (numPars, numPars); 950 951 for (int pi = 0; pi < numPars; pi++) 952 for (int pj = 0; pj < numPars; pj++) 953 AI(pi, pj) = pj < pi ? AI(pj, pi) : 954 0.5 * dotMultiCovCompVecs(VinvdVdparsVinvyMultiCovCompVecs + pi * DxNC, 955 dVdparsVinvyMultiCovCompVecs + pj * DxNC, D); 956 957 ALIGNED_FREE(VinvdVdparsVinvyMultiCovCompVecs); 958 ALIGNED_FREE(dVdparsVinvyMultiCovCompVecs); 959 ALIGNED_FREE(ThetasVinvyMultiCovCompVecs); 960 961 return AI; 962 } 963 964 /** 965 * pheno: (in) real phenotype (data rep), possibly of size N or zero-filled beyond (no covComps) 966 * snpVCnums: (in) M-vector of assignments of SNPs to VCs (0 -> ignore; 1..VCs -> var comps) 967 */ remlAI(vector<ublas::matrix<double>> & Vegs,bool usePhenoCorrs,const vector<vector<double>> & phenos,const uchar snpVCnums[],int MCtrialsCoarse,int MCtrialsFine,int CGmaxIters,double CGtol,int seed) const968 void Bolt::remlAI(vector < ublas::matrix <double> > &Vegs, bool usePhenoCorrs, 969 const vector < vector <double> > &phenos, const uchar snpVCnums[], 970 int MCtrialsCoarse, int MCtrialsFine, int CGmaxIters, double CGtol, 971 int seed) const { 972 973 // determine number of VCs and scale factors vcXscale2s = 1 / "M_k": 974 // view each X_k as X_k * 1/sqrt("M_k" = sum(Xnorm2s)/(Nused-Cindep)); 975 // then all 1+VCs var comps (inc. identity) are on same footing 976 int VCs = 0; vector <double> vcXscale2s(1, 1); 977 for (uint64 m = 0; m < M; m++) { 978 if (projMaskSnps[m] && snpVCnums[m] > VCs) { 979 VCs = snpVCnums[m]; 980 vcXscale2s.resize(VCs+1); 981 } 982 if (projMaskSnps[m] && snpVCnums[m]) 983 vcXscale2s[snpVCnums[m]] += Xnorm2s[m]; 984 } 985 for (int k = 1; k <= VCs; k++) 986 vcXscale2s[k] = (Nused-Cindep)/vcXscale2s[k]; 987 988 if (VCs != (int) Vegs.size() - 1) { 989 cerr << "ERROR: # of VCs represented in non-masked SNPs does not match # in model" << endl; 990 cerr << " Did a variance component lose all of its SNPs during Bolt QC?" << endl; 991 exit(1); 992 } 993 994 uint64 D = phenos.size(); 995 uint64 DxNC = D * (Nstride+Cstride); 996 int numPars = (1+VCs) * D*(D+1)/2; 997 998 ublas::matrix <double> AI; 999 ublas::matrix <double> phenoNormsCorrs; 1000 int MCtrials; 1001 double tolLL; 1002 1003 for (int phase = 0; phase < 2; phase++) { 1004 cout << endl 1005 << "===============================================================================" 1006 << endl << endl; 1007 if (phase == 0) { 1008 MCtrials = MCtrialsCoarse; 1009 tolLL = 1e-2; 1010 cout << "Stochastic REML optimization with MCtrials = " << MCtrials << endl << endl; 1011 } 1012 else { 1013 if (MCtrialsFine <= MCtrialsCoarse) 1014 break; 1015 MCtrials = MCtrialsFine; 1016 tolLL = 1e-4; 1017 cout << "Refining REML optimization with MCtrials = " << MCtrials << endl << endl; 1018 } 1019 1020 // generate env and gen (1 gen per VC) components for rand, data phenotype vectors 1021 double *yEnvGenUnscaledMultiCovCompVecs = ALIGNED_MALLOC_DOUBLES((1+VCs)*(MCtrials+1) * DxNC); 1022 phenoNormsCorrs = genUnscaledMultiCovCompVecs(yEnvGenUnscaledMultiCovCompVecs, phenos, 1023 snpVCnums, VCs, vcXscale2s, MCtrials, seed); 1024 cout << "phenoNormsCorrs" << phenoNormsCorrs << endl; 1025 if (phase == 0 && usePhenoCorrs) 1026 for (int k = 0; k <= VCs; k++) 1027 for (uint64 di = 0; di < D; di++) 1028 for (uint64 dj = 0; dj < di; dj++) 1029 Vegs[k](di, dj) = Vegs[k](dj, di) = 1030 sqrt(Vegs[k](di, di) * Vegs[k](dj, dj)) * phenoNormsCorrs(di, dj); 1031 1032 double *yRandsDataMultiCovCompVecs = ALIGNED_MALLOC_DOUBLES((MCtrials+1) * DxNC); 1033 double *VinvyRandsDataMultiCovCompVecs = ALIGNED_MALLOC_DOUBLES((MCtrials+1) * DxNC); 1034 const double *VinvyMultiCovCompVecs = VinvyRandsDataMultiCovCompVecs + MCtrials * DxNC; 1035 1036 /* EM step 1037 combineEnvGenMultiCovCompVecs(yRandsDataMultiCovCompVecs, yEnvGenUnscaledMultiCovCompVecs, 1038 Vegs, MCtrials); 1039 conjGradSolveVmulti(VinvyRandsDataMultiCovCompVecs, false, yRandsDataMultiCovCompVecs, 1040 MCtrials+1, snpVCnums, vcXscale2s, Vegs, CGmaxIters, CGtol); 1041 updateVegs(Vegs, VinvyRandsDataMultiCovCompVecs, snpVCnums, vcXscale2s, MCtrials); 1042 */ 1043 1044 cout << "Initial variance parameter guesses:" << endl; 1045 for (int k = 0; k <= VCs; k++) 1046 cout << "Vegs[" << k << "]" << Vegs[k] << endl; 1047 cout << endl; 1048 cout << "Performing initial gradient evaluation" << endl; 1049 // compute gradient 1050 combineEnvGenMultiCovCompVecs(yRandsDataMultiCovCompVecs, yEnvGenUnscaledMultiCovCompVecs, 1051 Vegs, MCtrials); 1052 conjGradSolveVmulti(VinvyRandsDataMultiCovCompVecs, false, yRandsDataMultiCovCompVecs, 1053 MCtrials+1, snpVCnums, vcXscale2s, Vegs, CGmaxIters, CGtol); 1054 1055 ublas::vector <double> grad = computeMCgrad(VinvyRandsDataMultiCovCompVecs, D, snpVCnums, VCs, 1056 vcXscale2s, MCtrials); 1057 cout << "grad" << grad << endl << endl; 1058 1059 const double eta1 = 1e-4, eta2 = 0.99; 1060 const double alpha1 = 0.25, alpha2 = 3.5; 1061 double Delta = 1e100; // initialize step norm bound to large 1062 1063 const int AImaxIters = 20; 1064 for (int iter = 0; iter < AImaxIters; iter++) { 1065 cout << "-------------------------------------------------------------------------------" 1066 << endl << endl; 1067 cout << "Start ITER " << (iter+1) << ": computing AI matrix" << endl; 1068 AI = computeAI(Vegs, VinvyMultiCovCompVecs, snpVCnums, vcXscale2s, CGmaxIters, CGtol); 1069 //cout << "AI" << AI << endl; 1070 1071 double dLL = -1; 1072 int att, maxAttempts = 5; 1073 bool converged = false; 1074 for (att = 1; att <= maxAttempts; att++) { 1075 double dLLpred; ublas::vector <double> p; 1076 vector < ublas::matrix <double> > optVegs = 1077 NonlinearOptMulti::constrainedNR(dLLpred, p, Vegs, grad, AI, Delta); 1078 cout << endl << "Constrained Newton-Raphson optimized variance parameters:" << endl; 1079 for (int k = 0; k <= VCs; k++) 1080 cout << "optVegs[" << k << "]" << optVegs[k] << endl; 1081 cout << endl; 1082 cout << "Predicted change in log likelihood: " << dLLpred << endl; 1083 if (dLLpred < tolLL) { 1084 cout << "AI iteration converged: predicted change in log likelihood < tol = " << tolLL 1085 << endl; 1086 Vegs = optVegs; 1087 converged = true; 1088 break; 1089 } 1090 1091 cout << endl << "Computing actual (approximate) change in log likelihood" << endl; 1092 combineEnvGenMultiCovCompVecs(yRandsDataMultiCovCompVecs, yEnvGenUnscaledMultiCovCompVecs, 1093 optVegs, MCtrials); 1094 conjGradSolveVmulti(VinvyRandsDataMultiCovCompVecs, false, yRandsDataMultiCovCompVecs, 1095 MCtrials+1, snpVCnums, vcXscale2s, optVegs, CGmaxIters, CGtol); 1096 ublas::vector <double> optGrad = computeMCgrad(VinvyRandsDataMultiCovCompVecs, D, 1097 snpVCnums, VCs, vcXscale2s, MCtrials); 1098 cout << "grad" << optGrad << endl; 1099 dLL = ublas::inner_prod(p, 0.5*(grad+optGrad)); 1100 cout << endl << "Approximate change in log likelihood: " << dLL 1101 << " (attempt " << att << ")" << endl; 1102 1103 double rho = dLL / dLLpred; 1104 if (ublas::norm_2(optGrad) > 2*ublas::norm_2(grad)) { 1105 rho = -1; 1106 cout << "Large increase in grad norm: dangerous model deviation? Setting rho=-1" 1107 << endl; 1108 } 1109 1110 cout << "rho (approximate / predicted change in LL) = " << rho << endl; 1111 cout << "Old trust region radius: " << Delta << endl; 1112 1113 // update trust region radius 1114 ublas::vector <double> Dp = p; // scale step coordinates using diagonal of AI matrix 1115 for (int par = 0; par < numPars; par++) 1116 Dp(par) *= AI(par, par); 1117 1118 if (rho < eta1) // bad step: reduce trust region 1119 Delta = alpha1 * ublas::norm_2(Dp); 1120 else if (rho < eta2) // ok step: do nothing 1121 ; 1122 else // great step: expand trust region 1123 Delta = std::max(Delta, alpha2 * ublas::norm_2(Dp)); 1124 cout << "New trust region radius: " << Delta << endl; 1125 1126 if (rho > eta1) { // accept step and exit inner loop 1127 cout << "Accepted step" << endl; 1128 Vegs = optVegs; 1129 grad = optGrad; 1130 break; 1131 } 1132 else { 1133 cout << "Rejected step" << endl; 1134 } 1135 } 1136 1137 if (converged) 1138 break; 1139 else if (dLL < 0) { 1140 cerr << "WARNING: Failed to accept step in " << maxAttempts << " attempts" << endl; 1141 cerr << " Stopping AI iteration, but optimization may not have converged" << endl; 1142 break; 1143 } 1144 else { 1145 cout << endl << "End ITER " << (iter+1) << endl; 1146 for (int k = 0; k <= VCs; k++) 1147 cout << "Vegs[" << k << "]" << Vegs[k] << endl; 1148 cout << endl; 1149 } 1150 } 1151 1152 ALIGNED_FREE(VinvyRandsDataMultiCovCompVecs); 1153 ALIGNED_FREE(yRandsDataMultiCovCompVecs); 1154 ALIGNED_FREE(yEnvGenUnscaledMultiCovCompVecs); 1155 1156 } 1157 1158 cout << endl; 1159 ublas::matrix <double> AIinv = MatrixUtils::invert(AI); 1160 cout << "AIinv" << AIinv << endl << endl; 1161 1162 int curPar = 0; 1163 for (int k = 0; k <= VCs; k++) { 1164 cout << "Variance component " << k << ": " << Vegs[k] << endl; 1165 for (uint64 di = 0; di < D; di++) 1166 for (uint64 dj = 0; dj <= di; dj++) { 1167 printf(" entry (%d,%d): %.6f (%.6f)", (int) dj+1, (int) di+1, Vegs[k](di, dj), 1168 sqrt(AIinv(curPar, curPar))); 1169 if (di != dj) 1170 printf(" corr (%d,%d): %.6f", (int) dj+1, (int) di+1, 1171 Vegs[k](di, dj) / sqrt(Vegs[k](di, di) * Vegs[k](dj, dj))); 1172 cout << endl; 1173 curPar++; 1174 } 1175 } 1176 cout << endl; 1177 1178 // apply coordinate transformation to h2, r_g; compute SEs 1179 1180 double sigma2s[D], h2rgs[1+VCs][D][D]; // transformed pars 1181 int par[1+VCs][D][D]; // indices of parameters in [0..numPars) 1182 ublas::matrix <double> J = ublas::zero_matrix <double> (numPars, numPars); 1183 1184 // transform off-diagonals from covariances to correlations (to get SEs on corrs) 1185 curPar = 0; 1186 for (int k = 0; k <= VCs; k++) 1187 for (uint64 i = 0; i < D; i++) 1188 for (uint64 j = 0; j <= i; j++) { 1189 // to get point estimates of gen corrs, just divide by sqrt prod 1190 h2rgs[k][i][j] = i == j ? Vegs[k](i, j) : 1191 Vegs[k](i, j) / sqrt(Vegs[k](i, i) * Vegs[k](j, j)); 1192 par[k][i][j] = curPar++; 1193 } 1194 for (int k = 0; k <= VCs; k++) 1195 for (uint64 i = 0; i < D; i++) 1196 for (uint64 j = 0; j <= i; j++) { 1197 if (i == j) 1198 J(par[k][i][i], par[k][i][i]) = 1; 1199 else { 1200 J(par[k][i][j], par[k][i][i]) = Vegs[k](i, j) / (2*Vegs[k](i, i)); 1201 J(par[k][i][j], par[k][i][j]) = Vegs[k](i, j) / (h2rgs[k][i][j]); 1202 J(par[k][i][j], par[k][j][j]) = Vegs[k](i, j) / (2*Vegs[k](j, j)); 1203 } 1204 } 1205 ublas::matrix <double> rgAI = 1206 ublas::prod(ublas::matrix <double> (ublas::prod(ublas::trans(J), AI)), J); 1207 1208 // transform variance coords to: (sigma2 scale parameter, h2_1, h2_2, ..., h2_VCs) 1209 for (int i = 0; i < (int) D; i++) { 1210 // to get point estimates of sigma2 for each trait, sum raw per-VC sigma2s over VCs 1211 sigma2s[i] = 0; 1212 for (int k = 0; k <= VCs; k++) 1213 sigma2s[i] += Vegs[k](i, i); 1214 // to get point estimate h2s (including env), take fractions (raw sigma2s over VCs) / sum 1215 for (int k = 0; k <= VCs; k++) 1216 h2rgs[k][i][i] = Vegs[k](i, i) / sigma2s[i]; 1217 } 1218 1219 ublas::matrix <double> h2rgAIinv[2]; h2rgAIinv[0] = h2rgAIinv[1] = rgAI; 1220 // leave out one k (VC): h2s for all other VCs, 1-sum(h2s) for left-out kOut 1221 // kOut = 0: get SEs for sigma2, h2 (for all traits), r_g (for all trait pairs) 1222 // kOut = 1: get SEs for environment/noise h2 1223 for (int kOut = 0; kOut < 2; kOut++) { 1224 for (int i = 0; i < (int) D; i++) { // apply transform to parameters for each trait in turn 1225 J = ublas::identity_matrix <double> (numPars); 1226 for (int k = 0; k <= VCs; k++) { 1227 if (k != kOut) { 1228 J(par[kOut][i][i], par[k][i][i]) = -sigma2s[i]; 1229 J(par[k][i][i], par[k][i][i]) = sigma2s[i]; 1230 } 1231 J(par[k][i][i], par[kOut][i][i]) = h2rgs[k][i][i]; 1232 } 1233 // note: these Js don't affect r_g coords 1234 h2rgAIinv[kOut] = 1235 ublas::prod(ublas::matrix <double> (ublas::prod(ublas::trans(J), h2rgAIinv[kOut])), J); 1236 } 1237 h2rgAIinv[kOut] = MatrixUtils::invert(h2rgAIinv[kOut]); 1238 } 1239 1240 for (int i = 0; i < (int) D; i++) 1241 printf("Phenotype %d variance sigma2: %f (%f)\n", i+1, 1242 sigma2s[i] * NumericUtils::sq(phenoNormsCorrs(i, i)), 1243 sqrt(h2rgAIinv[0](i, i)) * NumericUtils::sq(phenoNormsCorrs(i, i))); 1244 cout << endl; 1245 1246 const vector <string> &vcNames = snpData.getVCnames(); 1247 1248 curPar = 0; 1249 for (int k = 0; k <= VCs; k++) { 1250 cout << "Variance component " << k << ": "; 1251 if (k == 0) cout << " (environment/noise)" << endl; 1252 else cout << " \"" << vcNames[k] << "\"" << endl; 1253 for (uint64 di = 0; di < D; di++) 1254 for (uint64 dj = 0; dj <= di; dj++) { 1255 if (di == dj) 1256 printf(" h2%c (%d,%d): %.6f (%.6f)", k==0?'e':'g', (int) dj+1, (int) di+1, 1257 h2rgs[k][di][dj], sqrt((1+1.0/MCtrials) * h2rgAIinv[k==0](curPar, curPar))); 1258 else 1259 printf(" %s corr (%d,%d): %.6f (%.6f)", k==0?"resid":"gen", 1260 (int) dj+1, (int) di+1, h2rgs[k][di][dj], 1261 sqrt((1+1.0/MCtrials) * h2rgAIinv[0](curPar, curPar))); 1262 cout << endl; 1263 curPar++; 1264 } 1265 } 1266 1267 /* 1268 vector <int> digits(VCs); 1269 for (int v = 0; v < VCs; v++) 1270 digits[v] = std::min(10, std::max(2, (int) -log10(std::max(evalData.xSEs(v), 1e-10)) + 2)); 1271 int maxDigits = *std::max_element(digits.begin(), digits.end()); 1272 for (int v = 0; v < VCs; v++) { 1273 char format[100]; 1274 1275 char h2buf[100]; 1276 sprintf(format, "%%.%df", digits[v]); 1277 sprintf(h2buf, format, x[v]); 1278 1279 char h2SEbuf[100]; 1280 sprintf(format, "(%%.%df)", digits[v]); 1281 sprintf(h2SEbuf, format, evalData.xSEs(v)); 1282 1283 sprintf(format, " Variance component %%d: %%-%ds %%-%ds", maxDigits+2, maxDigits+4); 1284 printf(format, v+1, h2buf, h2SEbuf); 1285 cout << " \"" << snpData.getVCnames()[v+1] << "\"" << endl; 1286 } 1287 */ 1288 1289 cout << endl; 1290 } 1291 1292 } 1293