1 /*
2    This file is part of the BOLT-LMM linear mixed model software package
3    developed by Po-Ru Loh.  Copyright (C) 2014-2019 Harvard University.
4 
5    This program is free software: you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published by
7    the Free Software Foundation, either version 3 of the License, or
8    (at your option) any later version.
9 
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU General Public License for more details.
14 
15    You should have received a copy of the GNU General Public License
16    along with this program.  If not, see <http://www.gnu.org/licenses/>.
17 */
18 
19 #include <cmath>
20 #include <cstring>
21 #include <cstdio>
22 #include <vector>
23 #include <string>
24 #include <iostream>
25 #include <sstream>
26 #include <fstream>
27 #include <map>
28 #include <set>
29 #include <algorithm>
30 #include <numeric>
31 #include <utility>
32 
33 #include "omp.h"
34 
35 #include <boost/random.hpp>
36 #include <boost/random/mersenne_twister.hpp>
37 #include <boost/random/normal_distribution.hpp>
38 #include <boost/numeric/ublas/vector.hpp>
39 #include <boost/numeric/ublas/matrix.hpp>
40 #include <boost/numeric/ublas/io.hpp>
41 
42 #include "Types.hpp"
43 #include "Timer.hpp"
44 #include "SnpData.hpp"
45 #include "CovariateBasis.hpp"
46 #include "NumericUtils.hpp"
47 #include "LapackConst.hpp"
48 #include "MemoryUtils.hpp"
49 #include "Jackknife.hpp"
50 #include "StatsUtils.hpp"
51 #include "MatrixUtils.hpp"
52 #include "NonlinearOptMulti.hpp"
53 #include "Bolt.hpp"
54 
55 namespace LMM {
56 
57   namespace ublas = boost::numeric::ublas;
58   using std::vector;
59   using std::string;
60   using std::pair;
61   using std::cout;
62   using std::cerr;
63   using std::endl;
64 
65   /**
66    * for k=1..VCs, computes (Theta_k[b] - coeffI*I) * x[b] = (Z_k[b] * Z_k[b]' - coeffI*I) * x[b],
67    * where
68    *     Z_k[b] = sqrt(vcXscale2s[k]) X_k
69    *     Theta_k[b] = vcXscale2s[k] X_k * X_k'
70    * (see conjGradSolveW for more info)
71    *
72    * multCovCompVecs: (out) VCs x B x (Nstride+Cstride)
73    * xCovCompVecs: (in) B x (Nstride+Cstride)
74    * h2Xscale2batches: (in) (1+VCs) x B table of combined scale factors for easy application
75    */
multThetaMinusIs(double multCovCompVecs[],const double xCovCompVecs[],const uchar snpVCnums[],const vector<double> & vcXscale2s,uint64 B,double coeffI) const76   void Bolt::multThetaMinusIs(double multCovCompVecs[], const double xCovCompVecs[],
77 			      const uchar snpVCnums[], const vector <double> &vcXscale2s,
78 			      uint64 B, double coeffI) const {
79 #ifdef VERBOSE
80     Timer timer;
81     printf("  Multiplying solutions by variance components... ");
82     fflush(stdout);
83 #endif
84     int VCs = vcXscale2s.size()-1;
85     const uint64 BxNC = B*(Nstride+Cstride);
86     memset(multCovCompVecs, 0, VCs * BxNC * sizeof(multCovCompVecs[0])); // initialize answers to 0
87 
88     double *snpCovCompVecBlock = ALIGNED_MALLOC_DOUBLES(mBlockMultX * (Nstride+Cstride));
89     double (*work)[4] = (double (*)[4]) ALIGNED_MALLOC(omp_get_max_threads() * 256*sizeof(*work));
90     double *XtransVecsBlock = ALIGNED_MALLOC_DOUBLES(mBlockMultX * B);
91 
92     // for each SNP block...
93     for (uint64 m0 = 0; m0 < M; m0 += mBlockMultX) {
94 
95       // (1) load the SNPs in parallel into Xblock [Nstride+Cstride x block size]
96       uint64 mBlockMultXCrop = std::min(M, m0+mBlockMultX) - m0;
97 #pragma omp parallel for
98       for (uint64 mPlus = 0; mPlus < mBlockMultXCrop; mPlus++) {
99 	uint64 m = m0+mPlus;
100 	if (projMaskSnps[m] && snpVCnums[m]) // build snp vector + sign-flipped covar comps
101 	  buildMaskedSnpNegCovCompVec(snpCovCompVecBlock + mPlus * (Nstride+Cstride), m,
102 				      work + (omp_get_thread_num()<<8));
103 	else
104 	  memset(snpCovCompVecBlock + mPlus * (Nstride+Cstride), 0,
105 		 (Nstride+Cstride) * sizeof(snpCovCompVecBlock[0]));
106       }
107 
108       // (2) multiply Xblock' [block size x Nstride+Cstride] * xVecs [Nstride+Cstride x B]
109       //              = XtransVecsBlock [block size x B]
110       //     (note that Xblock' has NEG CCVecs while xVecs has POS CCVecs)
111       {
112 	char TRANSA_ = 'T';
113 	char TRANSB_ = 'N';
114 	int M_ = B;
115 	int N_ = mBlockMultXCrop;
116 	int K_ = Nstride+Cstride;
117 	double ALPHA_ = 1.0;
118 	const double *A_ = xCovCompVecs;
119 	int LDA_ = Nstride+Cstride;
120 	double *B_ = snpCovCompVecBlock;
121 	int LDB_ = Nstride+Cstride;
122 	double BETA_ = 0.0;
123 	double *C_ = XtransVecsBlock;
124 	int LDC_ = B;
125 
126 	DGEMM_MACRO(&TRANSA_, &TRANSB_, &M_, &N_, &K_, &ALPHA_, A_, &LDA_, B_, &LDB_,
127 		    &BETA_, C_, &LDC_);
128       }
129 
130       // (4) for each snp individually in turn:
131       //     multiply Xblock_mPlus [NCstride x 1] * XtransVecsBlock_mPlus [1 x B]
132       //              = snp's contrib. to (k-1)^th result block (X_k X_k') * xVecs [NCstride x B]
133       //     directly accumulate results in (k-1)^th answer mult [NCstride x B]
134       for (uint64 mPlus = 0; mPlus < mBlockMultXCrop; mPlus++) {
135 	uint64 m = m0+mPlus;
136 	if (projMaskSnps[m] && snpVCnums[m]) {
137 	  double *snpCovCompVec = snpCovCompVecBlock + mPlus * (Nstride+Cstride);
138 	  // note that Xblock was loaded with NEG CCVecs; now we need to sign-flip CCVecs
139 	  for (uint64 n = Nstride; n < Nstride+Cstride; n++)
140 	    snpCovCompVec[n] *= -1;
141 
142 	  // update result block for snpVCnums[m]: add XtransVecsBlock_mPlus * snpCovCompVec (DGER)
143 	  {
144 	    int M_ = Nstride+Cstride;
145 	    int N_ = B;
146 	    double ALPHA_ = 1;
147 	    double *X_ = snpCovCompVec;
148 	    int INCX_ = 1;
149 	    double *Y_ = XtransVecsBlock + mPlus * B;
150 	    int INCY_ = 1;
151 	    double *A_ = multCovCompVecs + (snpVCnums[m]-1) * BxNC;
152 	    int LDA_ = Nstride+Cstride;
153 	    DGER_MACRO(&M_, &N_, &ALPHA_, X_, &INCX_, Y_, &INCY_, A_, &LDA_);
154 	  }
155 	}
156       }
157     }
158     ALIGNED_FREE(XtransVecsBlock);
159     ALIGNED_FREE(work);
160     ALIGNED_FREE(snpCovCompVecBlock);
161 
162     // multiply vcXscale2s and subtract identity contributions
163     for (int v = 0; v < VCs; v++)
164       for (uint64 bnc = 0; bnc < BxNC; bnc++)
165 	multCovCompVecs[v*BxNC + bnc] =
166 	  vcXscale2s[v+1] * multCovCompVecs[v*BxNC + bnc] - coeffI * xCovCompVecs[bnc];
167 #ifdef VERBOSE
168     printf("time=%.2f\n", timer.update_time());
169     fflush(stdout);
170 #endif
171   }
172 
rightMultiT(double * x,uint64 D,uint64 stride,const ublas::matrix<double> & mat)173   void rightMultiT(double *x, uint64 D, uint64 stride, const ublas::matrix <double> &mat) {
174     vector <double> tmpD(D);
175     for (uint64 d = 0; d < D; d++)
176       for (uint64 i = 0; i < D; i++)
177 	tmpD[d] += x[i*stride] * mat(d, i);
178     for (uint64 d = 0; d < D; d++)
179       x[d*stride] = tmpD[d];
180   }
181 
dotMultiCovCompVecs(const double xCovCompVecs[],const double yCovCompVecs[],uint64 D) const182   double Bolt::dotMultiCovCompVecs(const double xCovCompVecs[], const double yCovCompVecs[],
183 				   uint64 D) const {
184     double ret = 0;
185     for (uint64 d = 0; d < D; d++)
186       ret += dotCovCompVecs(xCovCompVecs+d*(Nstride+Cstride), yCovCompVecs+d*(Nstride+Cstride));
187     return ret;
188   }
189 
computeMultiProjNorm2s(double projNorm2s[],const double xCovCompVecs[],uint64 D,uint64 B) const190   void Bolt::computeMultiProjNorm2s(double projNorm2s[], const double xCovCompVecs[], uint64 D,
191 				    uint64 B) const {
192     for (uint64 b = 0; b < B; b++) {
193       projNorm2s[b] = 0;
194       for (uint64 d = 0; d < D; d++)
195 	projNorm2s[b] += computeProjNorm2(xCovCompVecs + (b*D+d)*(Nstride+Cstride));
196     }
197   }
198 
199   /**
200    * TODO
201    * NOTE: VinvyMultiCovCompVecs is destroyed if Vegs[0] is non-identity!
202    */
updateVegs(vector<ublas::matrix<double>> & Vegs,double VinvyMultiCovCompVecs[],const uchar snpVCnums[],const vector<double> & vcXscale2s,int MCtrials) const203   ublas::vector <double> Bolt::updateVegs(vector < ublas::matrix <double> > &Vegs,
204 					  double VinvyMultiCovCompVecs[], const uchar snpVCnums[],
205 					  const vector <double> &vcXscale2s, int MCtrials) const {
206 
207     int VCs = Vegs.size()-1;
208     uint64 D = Vegs[0].size1();
209     vector < ublas::matrix <double> > deltaVegs(1+VCs, ublas::zero_matrix <double> (D, D)),
210       VegXscales(1+VCs);
211     for (int k = 0; k <= VCs; k++)
212       VegXscales[k] = (k == 0 ? 1 : sqrt(vcXscale2s[k])) * Vegs[k];
213     vector <double> denoms(1+VCs);
214     denoms[0] = (double) (Nused-Cindep);
215     for (uint64 m = 0; m < M; m++)
216       if (projMaskSnps[m] && snpVCnums[m])
217 	denoms[snpVCnums[m]]++;
218 
219     uint64 DxNC = D * (Nstride+Cstride);
220 
221     uint64 B = MCtrials+1;
222     double *snpCovCompVecBlock = ALIGNED_MALLOC_DOUBLES(mBlockMultX * (Nstride+Cstride));
223     double (*work)[4] = (double (*)[4]) ALIGNED_MALLOC(omp_get_max_threads() * 256*sizeof(*work));
224     double *XtransVecsBlock = ALIGNED_MALLOC_DOUBLES(mBlockMultX * B * D);
225 
226     // for each SNP block...
227     for (uint64 m0 = 0; m0 < M; m0 += mBlockMultX) {
228 
229       // (1) load the SNPs in parallel into Xblock [Nstride+Cstride x block size]
230       uint64 mBlockMultXCrop = std::min(M, m0+mBlockMultX) - m0;
231 #pragma omp parallel for
232       for (uint64 mPlus = 0; mPlus < mBlockMultXCrop; mPlus++) {
233 	uint64 m = m0+mPlus;
234 	if (projMaskSnps[m] && snpVCnums[m]) // build snp vector + sign-flipped covar comps
235 	  buildMaskedSnpNegCovCompVec(snpCovCompVecBlock + mPlus * (Nstride+Cstride), m,
236 				      work + (omp_get_thread_num()<<8));
237 	else
238 	  memset(snpCovCompVecBlock + mPlus * (Nstride+Cstride), 0,
239 		 (Nstride+Cstride) * sizeof(snpCovCompVecBlock[0]));
240       }
241 
242       // (2) multiply Xblock' [block size x Nstride+Cstride] * xMultiVecs [Nstride+Cstride x BxD]
243       //              = XtransVecsBlock [block size x BxD]
244       //     (note that Xblock' has NEG CCVecs while xVecs has POS CCVecs)
245       {
246 #ifdef MEASURE_DGEMM
247       //Timer timer;
248       unsigned long long tsc = Timer::rdtsc();
249 #endif
250 	char TRANSA_ = 'T';
251 	char TRANSB_ = 'N';
252 	int M_ = B * D;
253 	int N_ = mBlockMultXCrop;
254 	int K_ = Nstride+Cstride;
255 	double ALPHA_ = 1.0;
256 	const double *A_ = VinvyMultiCovCompVecs;
257 	int LDA_ = Nstride+Cstride;
258 	double *B_ = snpCovCompVecBlock;
259 	int LDB_ = Nstride+Cstride;
260 	double BETA_ = 0.0;
261 	double *C_ = XtransVecsBlock;
262 	int LDC_ = B * D;
263 
264 	DGEMM_MACRO(&TRANSA_, &TRANSB_, &M_, &N_, &K_, &ALPHA_, A_, &LDA_, B_, &LDB_,
265 		    &BETA_, C_, &LDC_);
266 #ifdef MEASURE_DGEMM
267       dgemmTicks += Timer::rdtsc() - tsc;
268       //dgemmTicks += timer.update_time();
269 #endif
270       }
271 
272       // (3) SNP-wise multiply XtransVecsBlock by appropriate Veg factor along with sqrt(Xscale2s)
273       for (uint64 mPlus = 0; mPlus < mBlockMultXCrop; mPlus++) {
274 	uint64 m = m0+mPlus;
275 	if (snpVCnums[m]) { // SNP belongs to a variance component
276 	  uint64 k = snpVCnums[m];
277 	  for (uint64 b = 0; b < B; b++) {
278 	    uint64 coeffStart = mPlus*B*D + b*D;
279 	    rightMultiT(XtransVecsBlock + coeffStart, D, 1, VegXscales[k]);
280 	    double mult = (int) b == MCtrials ? 1 : -1.0/MCtrials;
281 	    for (uint64 di = 0; di < D; di++)
282 	      for (uint64 dj = 0; dj <= di; dj++)
283 		deltaVegs[k](di, dj) += mult * XtransVecsBlock[coeffStart+di] *
284 		  XtransVecsBlock[coeffStart+dj];
285 	  }
286 	}
287       }
288 
289     }
290     ALIGNED_FREE(XtransVecsBlock);
291     ALIGNED_FREE(work);
292     ALIGNED_FREE(snpCovCompVecBlock);
293 
294     // compute env variance parameter matrix Vegs[0] updates
295     // directly use ***and destroy*** VinvyMultiCovCompVecs: need to rightMultiT by Vegs[0]
296     for (int t = 0; t <= MCtrials; t++) {
297       for (uint64 nc = 0; nc < Nstride+Cstride; nc++) // replace Vinvy with Ue-hats (env)
298 	rightMultiT(VinvyMultiCovCompVecs + t*DxNC + nc, D, Nstride+Cstride, Vegs[0]);
299       double mult = t == MCtrials ? 1 : -1.0/MCtrials;
300       for (uint64 di = 0; di < D; di++)
301 	for (uint64 dj = 0; dj <= di; dj++)
302 	  deltaVegs[0](di, dj) += mult *
303 	    dotCovCompVecs(VinvyMultiCovCompVecs + t*DxNC + di*(Nstride+Cstride),
304 			   VinvyMultiCovCompVecs + t*DxNC + dj*(Nstride+Cstride));
305     }
306 
307     for (int k = 0; k <= VCs; k++) {
308       for (uint64 di = 0; di < D; di++)
309 	for (uint64 dj = 0; dj <= di; dj++)
310 	  deltaVegs[k](dj, di) = deltaVegs[k](di, dj); // symmetrize
311       Vegs[k] += 1/denoms[k] * deltaVegs[k];
312     }
313 
314     int numPars = (1+VCs) * D*(D+1)/2;
315     ublas::vector <double> grad(numPars);
316     int curPar = 0;
317     for (int k = 0; k <= VCs; k++)
318       for (uint64 di = 0; di < D; di++)
319 	for (uint64 dj = 0; dj <= di; dj++)
320 	  grad(curPar++) = (di == dj ? 0.5 : 1.0) * deltaVegs[k](di, dj);
321     return grad;
322   }
323 
computeMCgrad(const double VinvyMultiCovCompVecs[],uint64 D,const uchar snpVCnums[],int VCs,const vector<double> & vcXscale2s,int MCtrials) const324   ublas::vector <double> Bolt::computeMCgrad(const double VinvyMultiCovCompVecs[], uint64 D,
325 					     const uchar snpVCnums[], int VCs,
326 					     const vector <double> &vcXscale2s, int MCtrials)
327     const {
328 
329     int numPars = (1+VCs) * D*(D+1)/2;
330     ublas::vector <double> grad = ublas::zero_vector <double> (numPars);
331     uint64 DxNC = D * (Nstride+Cstride);
332     uint64 B = MCtrials+1;
333     double *snpCovCompVecBlock = ALIGNED_MALLOC_DOUBLES(mBlockMultX * (Nstride+Cstride));
334     double (*work)[4] = (double (*)[4]) ALIGNED_MALLOC(omp_get_max_threads() * 256*sizeof(*work));
335     double *XtransVecsBlock = ALIGNED_MALLOC_DOUBLES(mBlockMultX * B * D);
336 
337     // for each SNP block...
338     for (uint64 m0 = 0; m0 < M; m0 += mBlockMultX) {
339 
340       // (1) load the SNPs in parallel into Xblock [Nstride+Cstride x block size]
341       uint64 mBlockMultXCrop = std::min(M, m0+mBlockMultX) - m0;
342 #pragma omp parallel for
343       for (uint64 mPlus = 0; mPlus < mBlockMultXCrop; mPlus++) {
344 	uint64 m = m0+mPlus;
345 	if (projMaskSnps[m] && snpVCnums[m]) // build snp vector + sign-flipped covar comps
346 	  buildMaskedSnpNegCovCompVec(snpCovCompVecBlock + mPlus * (Nstride+Cstride), m,
347 				      work + (omp_get_thread_num()<<8));
348 	else
349 	  memset(snpCovCompVecBlock + mPlus * (Nstride+Cstride), 0,
350 		 (Nstride+Cstride) * sizeof(snpCovCompVecBlock[0]));
351       }
352 
353       // (2) multiply Xblock' [block size x Nstride+Cstride] * xMultiVecs [Nstride+Cstride x BxD]
354       //              = XtransVecsBlock [block size x BxD]
355       //     (note that Xblock' has NEG CCVecs while xVecs has POS CCVecs)
356       {
357 #ifdef MEASURE_DGEMM
358       //Timer timer;
359       unsigned long long tsc = Timer::rdtsc();
360 #endif
361 	char TRANSA_ = 'T';
362 	char TRANSB_ = 'N';
363 	int M_ = B * D;
364 	int N_ = mBlockMultXCrop;
365 	int K_ = Nstride+Cstride;
366 	double ALPHA_ = 1.0;
367 	const double *A_ = VinvyMultiCovCompVecs;
368 	int LDA_ = Nstride+Cstride;
369 	double *B_ = snpCovCompVecBlock;
370 	int LDB_ = Nstride+Cstride;
371 	double BETA_ = 0.0;
372 	double *C_ = XtransVecsBlock;
373 	int LDC_ = B * D;
374 
375 	DGEMM_MACRO(&TRANSA_, &TRANSB_, &M_, &N_, &K_, &ALPHA_, A_, &LDA_, B_, &LDB_,
376 		    &BETA_, C_, &LDC_);
377 #ifdef MEASURE_DGEMM
378       dgemmTicks += Timer::rdtsc() - tsc;
379       //dgemmTicks += timer.update_time();
380 #endif
381       }
382 
383       // (3) SNP-wise multiply XtransVecsBlock by appropriate Veg factor along with sqrt(Xscale2s)
384       for (uint64 mPlus = 0; mPlus < mBlockMultXCrop; mPlus++) {
385 	uint64 m = m0+mPlus;
386 	if (snpVCnums[m]) { // SNP belongs to a variance component
387 	  uint64 k = snpVCnums[m];
388 	  for (uint64 b = 0; b < B; b++) {
389 	    uint64 coeffStart = mPlus*B*D + b*D;
390 	    double mult = ((int) b == MCtrials ? 1 : -1.0/MCtrials) * vcXscale2s[k];
391 	    int curPar = k*D*(D+1)/2;
392 	    for (uint64 di = 0; di < D; di++)
393 	      for (uint64 dj = 0; dj <= di; dj++)
394 		grad(curPar++) += (di == dj ? 0.5 : 1.0) * mult * XtransVecsBlock[coeffStart+di] *
395 		  XtransVecsBlock[coeffStart+dj];
396 	  }
397 	}
398       }
399 
400     }
401     ALIGNED_FREE(XtransVecsBlock);
402     ALIGNED_FREE(work);
403     ALIGNED_FREE(snpCovCompVecBlock);
404 
405     // compute grad components for env variance parameters
406     for (int t = 0; t <= MCtrials; t++) {
407       double mult = t == MCtrials ? 1 : -1.0/MCtrials;
408       int curPar = 0;
409       for (uint64 di = 0; di < D; di++)
410 	for (uint64 dj = 0; dj <= di; dj++)
411 	  grad(curPar++) += (di == dj ? 0.5 : 1.0) * mult *
412 	    dotCovCompVecs(VinvyMultiCovCompVecs + t*DxNC + di*(Nstride+Cstride),
413 			   VinvyMultiCovCompVecs + t*DxNC + dj*(Nstride+Cstride));
414     }
415 
416     return grad;
417   }
418 
419   /**
420    * computes Vmulti * X for a batch of ((N+C)D)-vectors X (each a stack of D (N+C)-vectors), where
421    *     Vmulti = kron(VegXscale2s[0], I) + SUM_k=1^VCs kron(VegXscale2s[k], X_k*X_k')
422    * and VegXscale2s = Veg .* [1 vcXscale2s[1]..vcXscale2s[VCs]]
423    *
424    * VmultiCovCompVecs: (out) B x D x (Nstride+Cstride)
425    * xMultiCovCompVecs: (in) B x D x (Nstride+Cstride)
426    * VegXscale2s: (in) (1+VCs) x DxD combined scale factors for easy application
427    */
multVmulti(double VmultiCovCompVecs[],const double xMultiCovCompVecs[],const uchar snpVCnums[],const vector<ublas::matrix<double>> & VegXscale2s,uint64 B) const428   void Bolt::multVmulti(double VmultiCovCompVecs[], const double xMultiCovCompVecs[],
429 			const uchar snpVCnums[],
430 			const vector <ublas::matrix <double> > &VegXscale2s, uint64 B) const {
431 
432     // note: algorithm is more efficient than from original multH:
433     // for each block, it performs multXtrans, scales the results, and immediately does multX
434     // this way, SNPs only need to be loaded once, and all VCs can be done at once
435 
436     uint64 D = VegXscale2s[0].size1();
437     uint64 DxNC = D * (Nstride+Cstride);
438     memcpy(VmultiCovCompVecs, xMultiCovCompVecs, B * DxNC * sizeof(VmultiCovCompVecs[0]));
439     // initialize each answer vec Vmulti to X (NC x D) * Ve (D x D) for environment/noise VC
440     for (uint64 b = 0; b < B; b++)
441       for (uint64 nc = 0; nc < Nstride+Cstride; nc++)
442 	rightMultiT(VmultiCovCompVecs + b*DxNC + nc, D, Nstride+Cstride, VegXscale2s[0]);
443 
444     double *snpCovCompVecBlock = ALIGNED_MALLOC_DOUBLES(mBlockMultX * (Nstride+Cstride));
445     double (*work)[4] = (double (*)[4]) ALIGNED_MALLOC(omp_get_max_threads() * 256*sizeof(*work));
446     double *XtransVecsBlock = ALIGNED_MALLOC_DOUBLES(mBlockMultX * B * D);
447 
448     // for each SNP block...
449     for (uint64 m0 = 0; m0 < M; m0 += mBlockMultX) {
450 
451       // (1) load the SNPs in parallel into Xblock [Nstride+Cstride x block size]
452       uint64 mBlockMultXCrop = std::min(M, m0+mBlockMultX) - m0;
453 #pragma omp parallel for
454       for (uint64 mPlus = 0; mPlus < mBlockMultXCrop; mPlus++) {
455 	uint64 m = m0+mPlus;
456 	if (projMaskSnps[m] && snpVCnums[m]) // build snp vector + sign-flipped covar comps
457 	  buildMaskedSnpNegCovCompVec(snpCovCompVecBlock + mPlus * (Nstride+Cstride), m,
458 				      work + (omp_get_thread_num()<<8));
459 	else
460 	  memset(snpCovCompVecBlock + mPlus * (Nstride+Cstride), 0,
461 		 (Nstride+Cstride) * sizeof(snpCovCompVecBlock[0]));
462       }
463 
464       // (2) multiply Xblock' [block size x Nstride+Cstride] * xMultiVecs [Nstride+Cstride x BxD]
465       //              = XtransVecsBlock [block size x BxD]
466       //     (note that Xblock' has NEG CCVecs while xVecs has POS CCVecs)
467       {
468 #ifdef MEASURE_DGEMM
469       //Timer timer;
470       unsigned long long tsc = Timer::rdtsc();
471 #endif
472 	char TRANSA_ = 'T';
473 	char TRANSB_ = 'N';
474 	int M_ = B * D;
475 	int N_ = mBlockMultXCrop;
476 	int K_ = Nstride+Cstride;
477 	double ALPHA_ = 1.0;
478 	const double *A_ = xMultiCovCompVecs;
479 	int LDA_ = Nstride+Cstride;
480 	double *B_ = snpCovCompVecBlock;
481 	int LDB_ = Nstride+Cstride;
482 	double BETA_ = 0.0;
483 	double *C_ = XtransVecsBlock;
484 	int LDC_ = B * D;
485 
486 	DGEMM_MACRO(&TRANSA_, &TRANSB_, &M_, &N_, &K_, &ALPHA_, A_, &LDA_, B_, &LDB_,
487 		    &BETA_, C_, &LDC_);
488 #ifdef MEASURE_DGEMM
489       dgemmTicks += Timer::rdtsc() - tsc;
490       //dgemmTicks += timer.update_time();
491 #endif
492       }
493 
494       // (3) SNP-wise multiply XtransVecsBlock by appropriate VegXscale2 factors
495       for (uint64 mPlus = 0; mPlus < mBlockMultXCrop; mPlus++) {
496 	uint64 m = m0+mPlus;
497 	if (snpVCnums[m]) { // SNP belongs to a variance component
498 	  uint64 k = snpVCnums[m];
499 	  for (uint64 b = 0; b < B; b++)
500 	    rightMultiT(XtransVecsBlock + mPlus*B*D + b*D, D, 1, VegXscale2s[k]);
501 	}
502       }
503 
504       // note that Xblock was loaded with NEG CCVecs; now we need to sign-flip CCVecs
505       for (uint64 mPlus = 0; mPlus < mBlockMultXCrop; mPlus++) {
506 	uint64 m = m0+mPlus;
507 	if (projMaskSnps[m] && snpVCnums[m]) // sign-flip covar comps
508 	  for (uint64 c = 0; c < Cstride; c++)
509 	    snpCovCompVecBlock[mPlus * (Nstride+Cstride) + Nstride + c] *= -1;
510       }
511 
512       // (4) multiply Xblock [NCstride x block size] * scaled XtransVecsBlock [block size x B]
513       //              = block's contribution to SUM_k (scale(k) X_k X_k') * xVecs [NCstride x B]
514       //     directly accumulate results in answer Wmult [NCstride x B]
515       {
516 #ifdef MEASURE_DGEMM
517       //Timer timer;
518       unsigned long long tsc = Timer::rdtsc();
519 #endif
520 	char TRANSA_ = 'N';
521 	char TRANSB_ = 'T';
522 	int M_ = Nstride+Cstride;
523 	int N_ = B * D;
524 	int K_ = mBlockMultXCrop;
525 	double ALPHA_ = 1.0;
526 	double *A_ = snpCovCompVecBlock;
527 	int LDA_ = Nstride+Cstride;
528 	const double *B_ = XtransVecsBlock;
529 	int LDB_ = B * D;
530 	double BETA_ = 1.0;
531 	double *C_ = VmultiCovCompVecs;
532 	int LDC_ = Nstride+Cstride;
533 	DGEMM_MACRO(&TRANSA_, &TRANSB_, &M_, &N_, &K_, &ALPHA_, A_, &LDA_, B_, &LDB_,
534 		    &BETA_, C_, &LDC_);
535 #ifdef MEASURE_DGEMM
536       dgemmTicks += Timer::rdtsc() - tsc;
537       //dgemmTicks += timer.update_time();
538 #endif
539       }
540 
541     }
542     ALIGNED_FREE(XtransVecsBlock);
543     ALIGNED_FREE(work);
544     ALIGNED_FREE(snpCovCompVecBlock);
545   }
546 
547   /**
548    * solves a batch of B equations
549    *     Vmulti * vec(X) = b
550    * where
551    *     Vmulti = kron(Vegs[0], I) + SUM_k=1^VCs kron(Vegs[k], vcXscale2s[k] * X_k*X_k')
552    * and vcXscale2s[k] = 1/("M_k" = sum(Xnorm2s[snpVCnums==k])/(Nused-Cindep))
553    * (and as usual, projections of x, b, and columns of X are implicitly represented via covComps)
554    *
555    * xMultiCovCompVecs: (in/out) B x D x Nstride+Cstride
556    * useStartVecs: use input xMultiCovCompVecs as initial guess for iteration
557    * bMultiCovCompVecs: (in) B x D x Nstride+Cstride
558    * Vegs: (in) (1+VCs) x DxD covariance parameters
559    */
conjGradSolveVmulti(double xMultiCovCompVecs[],bool useStartVecs,const double bMultiCovCompVecs[],uint64 B,const uchar snpVCnums[],const vector<double> & vcXscale2s,const vector<ublas::matrix<double>> & Vegs,int maxIters,double CGtol) const560   void Bolt::conjGradSolveVmulti(double xMultiCovCompVecs[], bool useStartVecs,
561 				 const double bMultiCovCompVecs[], uint64 B,
562 				 const uchar snpVCnums[], const vector <double> &vcXscale2s,
563 				 const vector < ublas::matrix <double> > &Vegs, int maxIters,
564 				 double CGtol) const {
565 
566     int VCs = Vegs.size()-1;
567     uint64 D = Vegs[0].size1();
568 
569     // combine Vegs and vcXscale2s into VegXscale2s
570     vector < ublas::matrix <double> > VegXscale2s(1+VCs);
571     VegXscale2s[0] = Vegs[0];
572     for (int k = 1; k <= VCs; k++)
573       VegXscale2s[k] = vcXscale2s[k] * Vegs[k];
574 
575 #ifdef VERBOSE
576     Timer timer;
577     cout << "  Batch-solving " << B << " systems of equations using conjugate gradient iteration"
578 	 << endl;
579 #endif
580 #ifdef MEASURE_DGEMM
581     unsigned long long tscStart = Timer::rdtsc();
582     dgemmTicks = 0;
583 #endif
584 
585     const uint64 BxDxNC = B * D * (Nstride+Cstride);
586 
587     vector <double> r2orig(B), r2olds(B), r2news(B);
588     computeMultiProjNorm2s(&r2orig[0], bMultiCovCompVecs, D, B);
589 
590     double *rMultiCovCompVecs = ALIGNED_MALLOC_DOUBLES(BxDxNC);
591     double *VmultiCovCompVecs = ALIGNED_MALLOC_DOUBLES(BxDxNC);
592     if (useStartVecs) {
593       multVmulti(VmultiCovCompVecs, xMultiCovCompVecs, snpVCnums, VegXscale2s, B); // V*x
594       for (uint64 bdnc = 0; bdnc < BxDxNC; bdnc++)
595 	rMultiCovCompVecs[bdnc] = bMultiCovCompVecs[bdnc] - VmultiCovCompVecs[bdnc]; // r=b-V*x
596       computeMultiProjNorm2s(&r2olds[0], rMultiCovCompVecs, D, B); // rsold=r'*r
597     }
598     else { // starting at x=0
599       memset(xMultiCovCompVecs, 0, BxDxNC * sizeof(rMultiCovCompVecs[0]));
600       memcpy(rMultiCovCompVecs, bMultiCovCompVecs, BxDxNC * sizeof(rMultiCovCompVecs[0]));
601       r2olds = r2orig;
602     }
603 
604     double *pMultiCovCompVecs = ALIGNED_MALLOC_DOUBLES(BxDxNC);
605     memcpy(pMultiCovCompVecs, rMultiCovCompVecs, BxDxNC * sizeof(pMultiCovCompVecs[0])); // p=r
606 
607     for (int iter = 0; iter < maxIters; iter++) {
608       multVmulti(VmultiCovCompVecs, pMultiCovCompVecs, snpVCnums, VegXscale2s, B); // V*p
609 
610       for (uint64 bdnc = 0, b = 0; b < B; b++) {
611 	double *p = pMultiCovCompVecs + b * D*(Nstride+Cstride);
612 	double *Vp = VmultiCovCompVecs + b * D*(Nstride+Cstride);
613 
614 	// alpha=rsold/(p'*Ap)
615 	double alpha = r2olds[b] / dotMultiCovCompVecs(p, Vp, D);
616 
617 	for (uint64 dnc = 0; dnc < D*(Nstride+Cstride); dnc++, bdnc++) {
618 	  xMultiCovCompVecs[bdnc] += alpha * pMultiCovCompVecs[bdnc]; //x=x+alpha*p
619 	  rMultiCovCompVecs[bdnc] -= alpha * VmultiCovCompVecs[bdnc]; //r=r-alpha*Ap
620 	}
621       }
622 
623       computeMultiProjNorm2s(&r2news[0], rMultiCovCompVecs, D, B); // rsnew=r'*r
624 
625 #ifdef VERBOSE
626       double min_rRatio = 1e9, max_rRatio = 0;
627       for (uint64 b = 0; b < B; b++) {
628 	double rRatio = sqrt(r2news[b] / r2orig[b]);
629 	min_rRatio = std::min(rRatio, min_rRatio);
630 	max_rRatio = std::max(rRatio, max_rRatio);
631       }
632 
633       vector <double> resNorm2s(B);
634       computeMultiProjNorm2s(&resNorm2s[0], xMultiCovCompVecs, D, B);
635 
636       printf("  iter %d:  time=%.2f  rNorms/orig: (%.1g,%.1g)  res2s: %g..%g\n",
637 	     iter+1, timer.update_time(), min_rRatio, max_rRatio, resNorm2s[0], resNorm2s[B-1]);
638       fflush(stdout);
639 #endif
640 
641       // check convergence
642       bool converged = true;
643       for (uint64 b = 0; b < B; b++)
644 	if (sqrt(r2news[b] / r2orig[b]) > CGtol)
645 	  converged = false;
646       if (converged) {
647 	cout << "  Converged at iter " << iter+1 << ": rNorms/orig all < CGtol=" << CGtol << endl;
648 	break;
649       }
650 
651       for (uint64 bdnc = 0, b = 0; b < B; b++) {
652 	double r2ratio = r2news[b] / r2olds[b];
653 	for (uint64 dnc = 0; dnc < D*(Nstride+Cstride); dnc++, bdnc++) // p=r+rsnew/rsold*p
654 	  pMultiCovCompVecs[bdnc] = rMultiCovCompVecs[bdnc] + r2ratio * pMultiCovCompVecs[bdnc];
655       }
656 
657       r2olds = r2news; // rsold=rsnew
658     }
659 
660     ALIGNED_FREE(pMultiCovCompVecs);
661     ALIGNED_FREE(VmultiCovCompVecs);
662     ALIGNED_FREE(rMultiCovCompVecs);
663 #ifdef MEASURE_DGEMM
664     double dgemmPct = 100 * dgemmTicks / (double) (Timer::rdtsc()-tscStart);
665     printf("  Time breakdown: dgemm = %.1f%%, memory/overhead = %.1f%%\n", dgemmPct, 100-dgemmPct);
666     fflush(stdout);
667 #endif
668   }
669 
670   /**
671    * creates yRandsData (MCtrials+1) MultiCovCompVecs as linear combinations of yEnvGenUnscaled
672    * viewing yEnvGenUnscaled as (1+VCs) blocks of size (MCtrials+1) x D x (Nstride+Cstride),
673    * yRandsData = lin. comb. of these blocks, where each NCxD chunk is right-mult by chol(Vegs)'
674    * (note that the last rep is the data and just gets yEnvGenUnscaled(0,MCtrials,:) copied in)
675    *
676    * yRandsDataMultiCovCompVecs: (out) (MCtrials+1) x D x (Nstride+Cstride)
677    * yEnvGenUnscaledMultiCovCompVecs: (in) (1+VCs) x (MCtrials+1) x D x (Nstride+Cstride)
678    *
679    */
combineEnvGenMultiCovCompVecs(double yRandsDataMultiCovCompVecs[],const double yEnvGenUnscaledMultiCovCompVecs[],const vector<ublas::matrix<double>> & Vegs,int MCtrials) const680   void Bolt::combineEnvGenMultiCovCompVecs(double yRandsDataMultiCovCompVecs[],
681 					   const double yEnvGenUnscaledMultiCovCompVecs[],
682 					   const vector < ublas::matrix <double> > &Vegs,
683 					   int MCtrials) const {
684 
685     int VCs = Vegs.size()-1;
686     uint64 D = Vegs[0].size1();
687     vector < ublas::matrix <double> > cholVegs(1+VCs);
688     for (int k = 0; k <= VCs; k++)
689       cholVegs[k] = MatrixUtils::chol(Vegs[k]);
690     uint64 DxNC = D * (Nstride+Cstride);
691     uint64 blockSize = (MCtrials+1) * DxNC;
692     memcpy(yRandsDataMultiCovCompVecs, yEnvGenUnscaledMultiCovCompVecs, // copy in env terms
693 	   blockSize * sizeof(yRandsDataMultiCovCompVecs[0]));
694     vector <double> tmpD(D);
695     for (int t = 0; t < MCtrials; t++)
696       for (uint64 nc = 0; nc < Nstride+Cstride; nc++) {
697 	// apply chol' to env terms currently in yRands (but not data rep)
698 	rightMultiT(yRandsDataMultiCovCompVecs + t*DxNC + nc, D, Nstride+Cstride, cholVegs[0]);
699 	// apply chol' to gen terms and accumulate in yRands
700 	for (int k = 1; k <= VCs; k++) {
701 	  const double *yGenUnscaledMultiCCVecs_k = yEnvGenUnscaledMultiCovCompVecs + k*blockSize;
702 	  for (uint64 d = 0; d < D; d++)
703 	    tmpD[d] = yGenUnscaledMultiCCVecs_k[t*DxNC + d*(Nstride+Cstride) + nc];
704 	  rightMultiT(&tmpD[0], D, 1, cholVegs[k]);
705 	  for (uint64 d = 0; d < D; d++)
706 	    yRandsDataMultiCovCompVecs[t*DxNC + d*(Nstride+Cstride) + nc] += tmpD[d];
707 	}
708       }
709   }
710 
711   /**
712    * TODO: update comments (basically the same as single-trait version but with MCtrials x D rands)
713    *
714    * generates 1+VCs blocks of MCtrials+1 (rand+data) component vecs for building MCreml phenotypes
715    * later, blocks will be combined with coeffs 1, rho_1, ..., rho_VCs
716    * components are pre-scaled only with sqrt(vcXscale2s) (~1/sqrt(M_k)), putting Xs on same scale
717    *
718    * yEnvGenUnscaledCovCompVecs: (out) (1+VCs) x (MCtrials+1) x (Nstride+Cstride)
719    *                                   env gen    rands data
720    *                         data rep:  y  000
721    *
722    * - projecting out covariates: implicitly done by covComps
723    * - applying maskIndivs: automatically done to snps (=> Gen component) by buildMaskedSnpVector
724    *                        needs to be applied to EnvUnscaled component
725    * - data layout: 1+VCs batches of {MCtrials rand reps + 1 data rep}
726    *   - first batch holds Env components
727    *   - remaining VCs batches hold Gen components
728    *   - in each batch:
729    *     - 0..MCtrials-1: Env, Gen components of random phenotypes
730    *     - MCtrials: Env = pheno from data; Gen = 0
731    *
732    * pheno: (in) real phenotype (data rep), possibly of size N or zero-filled beyond (no covComps)
733    * snpVCnums: (in) M-vector of assignments of SNPs to VCs (0 -> ignore; 1..VCs -> var comps)
734    * VCs: (in) number of non-identity VCs
735    * vcXscale2s: (in) (VCs+1)-vector of squared scale factors that normalize X's (ignore 0th entry)
736    *
737    * return: phenotype normalizations and correlations
738    */
genUnscaledMultiCovCompVecs(double yEnvGenUnscaledMultiCovCompVecs[],const vector<vector<double>> & phenos,const uchar snpVCnums[],int VCs,const vector<double> & vcXscale2s,int MCtrials,int seed) const739   ublas::matrix <double> Bolt::genUnscaledMultiCovCompVecs
740   (double yEnvGenUnscaledMultiCovCompVecs[], const vector < vector <double> > &phenos,
741    const uchar snpVCnums[], int VCs, const vector <double> &vcXscale2s, int MCtrials, int seed)
742     const {
743 
744     boost::mt19937 rng(seed+54321);
745     boost::variate_generator<boost::mt19937&, boost::normal_distribution<> >
746       randn(rng, boost::normal_distribution<>(0.0, 1.0));
747 
748     uint64 D = phenos.size();
749     uint64 DxNC = D * (Nstride+Cstride);
750 
751     // return matrix: phenotype normalizations and corrs
752     ublas::matrix <double> phenoNormsCorrs = ublas::zero_matrix <double> (D, D);
753 
754     // zero-initialize all vectors
755     memset(yEnvGenUnscaledMultiCovCompVecs, 0,
756 	   (1+VCs) * (MCtrials+1) * DxNC * sizeof(yEnvGenUnscaledMultiCovCompVecs[0]));
757 
758     // put pheno in Env (first=0 of 1+VCs) block, MCtrials (last=MCtrials of MCtrials+1) DxNC vec
759     double *phenoCovCompVecs = yEnvGenUnscaledMultiCovCompVecs + MCtrials * DxNC;
760     for (uint64 d = 0; d < D; d++) {
761       double *phenoCovCompVecs_d = phenoCovCompVecs + d*(Nstride+Cstride);
762       memcpy(phenoCovCompVecs_d, &phenos[d][0], phenos[d].size() * sizeof(phenoCovCompVecs[0]));
763       covBasis.applyMaskIndivs(phenoCovCompVecs_d);
764       covBasis.computeCindepComponents(phenoCovCompVecs_d + Nstride, phenoCovCompVecs_d);
765       // rescale phenos to have norm approximately 1
766       double scale = sqrt((Nused-Cindep) / computeProjNorm2(phenoCovCompVecs_d));
767       phenoNormsCorrs(d, d) = 1/scale;
768       for (uint nc = 0; nc < Nstride+Cstride; nc++)
769 	phenoCovCompVecs_d[nc] *= scale;
770     }
771     for (uint64 i = 0; i < D; i++)
772       for (uint64 j = i+1; j < D; j++)
773 	phenoNormsCorrs(i, j) = phenoNormsCorrs(j, i) =
774 	  dotCovCompVecs(phenoCovCompVecs + i*(Nstride+Cstride),
775 			 phenoCovCompVecs + j*(Nstride+Cstride)) / (Nused-Cindep);
776 
777     // generate yGen: VCs x (MCtrials+1) x DxNC with rand betas generated on-the-fly!
778     double *snpCovCompVec = ALIGNED_MALLOC_DOUBLES(Nstride+Cstride);
779     double (*work)[4] = (double (*)[4]) ALIGNED_MALLOC(256*sizeof(*work));
780     double *betas_m = ALIGNED_MALLOC_DOUBLES(MCtrials*D);
781     for (uint64 m = 0; m < M; m++)
782       if (projMaskSnps[m] && snpVCnums[m]) {
783 	// loop through SNPs; load just 1 (Nstride+Cstride) SNP at a time
784 	buildMaskedSnpCovCompVec(snpCovCompVec, m, work);
785 	for (int t = 0; t < MCtrials * (int) D; t++) // generate MCtrials*D random betas
786 	  betas_m[t] = randn();
787 	// DGER to update MCtrials x D x (Nstride+Cstride) block *for appropriate VC*
788 	uint64 k = snpVCnums[m];
789 	double *yGenBlock = yEnvGenUnscaledMultiCovCompVecs + k * (MCtrials+1) * DxNC;
790 	// update yGenBlock: add betas_m * snpCovCompVec (DGER)
791 	{
792 	  int M_ = Nstride+Cstride;
793 	  int N_ = MCtrials*D;
794 	  double ALPHA_ = sqrt(vcXscale2s[k]); // normalize 1/sqrt("M"=sum(Xnorm2s)/(Nused-Cindep))
795 	  double *X_ = snpCovCompVec;
796 	  int INCX_ = 1;
797 	  double *Y_ = betas_m;
798 	  int INCY_ = 1;
799 	  double *A_ = yGenBlock;
800 	  int LDA_ = Nstride+Cstride;
801 	  DGER_MACRO(&M_, &N_, &ALPHA_, X_, &INCX_, Y_, &INCY_, A_, &LDA_);
802 	}
803       }
804     ALIGNED_FREE(betas_m);
805     ALIGNED_FREE(work);
806     ALIGNED_FREE(snpCovCompVec);
807 
808     // generate yEnv: first MCtrials x (Nstride+Cstride) block of output array
809     // (last 1 x (Nstride+Cstride) is real data phenotype)
810     for (int t = 0; t < MCtrials; t++) {
811       // EnvUnscaled: epsCovCompVec <- N randn, after the following processing...
812       // - mask out maskIndivs (=> norm2 ~ Nused)
813       // - compute covComps: implicitly project out covars (=> norm2-SUM(comps2) ~ Nused-Cindep)
814       for (uint64 d = 0; d < D; d++) {
815 	double *randnEpsCovCompVec = yEnvGenUnscaledMultiCovCompVecs + t*DxNC+d*(Nstride+Cstride);
816 	for (uint64 n = 0; n < Nstride; n++)
817 	  if (maskIndivs[n])
818 	    randnEpsCovCompVec[n] = randn();
819 	// no need to zero out components after Cindep: already 0-initialized
820 	covBasis.computeCindepComponents(randnEpsCovCompVec + Nstride, randnEpsCovCompVec);
821       }
822     }
823 
824     return phenoNormsCorrs;
825   }
826 
updateVegsAI(vector<ublas::matrix<double>> & Vegs,const uchar snpVCnums[],const vector<double> & vcXscale2s,const double yRandsDataMultiCovCompVecs[],int MCtrials,int CGmaxIters,double CGtol) const827   void Bolt::updateVegsAI(vector < ublas::matrix <double> > &Vegs, const uchar snpVCnums[],
828 			  const vector <double> &vcXscale2s,
829 			  const double yRandsDataMultiCovCompVecs[], int MCtrials, int CGmaxIters,
830 			  double CGtol) const {
831 
832     int VCs = Vegs.size()-1;
833     uint64 D = Vegs[0].size1();
834     uint64 DxNC = D * (Nstride+Cstride);
835 
836     double *VinvyRandsDataMultiCovCompVecs = ALIGNED_MALLOC_DOUBLES((MCtrials+1) * DxNC);
837 
838     conjGradSolveVmulti(VinvyRandsDataMultiCovCompVecs, false, yRandsDataMultiCovCompVecs,
839 			MCtrials+1, snpVCnums, vcXscale2s, Vegs, CGmaxIters, CGtol);
840 
841     // compute gradient
842     vector < ublas::matrix <double> > identityVegs(1+VCs, ublas::identity_matrix <double> (D));
843     ublas::vector <double> grad = updateVegs(identityVegs, VinvyRandsDataMultiCovCompVecs,
844 					     snpVCnums, vcXscale2s, MCtrials);
845     cout << "grad" << grad << endl;
846 
847     // compute AI matrix
848 
849     const double *VinvyMultiCovCompVecs = VinvyRandsDataMultiCovCompVecs + MCtrials * DxNC;
850 
851     double *ThetasVinvyMultiCovCompVecs = ALIGNED_MALLOC_DOUBLES(VCs * DxNC);
852 
853     multThetaMinusIs(ThetasVinvyMultiCovCompVecs, VinvyMultiCovCompVecs, snpVCnums, vcXscale2s,
854 		     D, 0);
855 
856     int numPars = (1+VCs) * D*(D+1)/2;
857     double *dVdparsVinvyMultiCovCompVecs = ALIGNED_MALLOC_DOUBLES(numPars * DxNC);
858 
859     memset(dVdparsVinvyMultiCovCompVecs, 0, numPars*DxNC*sizeof(dVdparsVinvyMultiCovCompVecs[0]));
860     int curPar = 0;
861     for (int k = 0; k <= VCs; k++) { // populate dVdparsVinvyMultiCovCompVecs with computed vecs
862       const double *ThetasVinvyMultiCCVecs_k = k == 0 ? VinvyMultiCovCompVecs :
863 	ThetasVinvyMultiCovCompVecs + (k-1) * DxNC;
864       for (uint64 di = 0; di < D; di++)
865 	for (uint64 dj = 0; dj <= di; dj++) {
866 	  memcpy(dVdparsVinvyMultiCovCompVecs + curPar * DxNC + dj*(Nstride+Cstride),
867 		 ThetasVinvyMultiCCVecs_k + di*(Nstride+Cstride),
868 		 (Nstride+Cstride)*sizeof(ThetasVinvyMultiCCVecs_k[0]));
869 	  if (di != dj)
870 	    memcpy(dVdparsVinvyMultiCovCompVecs + curPar * DxNC + di*(Nstride+Cstride),
871 		   ThetasVinvyMultiCCVecs_k + dj*(Nstride+Cstride),
872 		   (Nstride+Cstride)*sizeof(ThetasVinvyMultiCCVecs_k[0]));
873 	  curPar++;
874 	}
875     }
876 
877     double *VinvdVdparsVinvyMultiCovCompVecs = ALIGNED_MALLOC_DOUBLES(numPars * DxNC);
878 
879     conjGradSolveVmulti(VinvdVdparsVinvyMultiCovCompVecs, false, dVdparsVinvyMultiCovCompVecs,
880 			numPars, snpVCnums, vcXscale2s, Vegs, CGmaxIters, CGtol);
881 
882     ublas::matrix <double> AI = ublas::zero_matrix <double> (numPars, numPars);
883 
884     for (int pi = 0; pi < numPars; pi++)
885       for (int pj = 0; pj < numPars; pj++)
886 	AI(pi, pj) = pj < pi ? AI(pj, pi) :
887 	  -0.5 * dotMultiCovCompVecs(VinvdVdparsVinvyMultiCovCompVecs + pi * DxNC,
888 				     dVdparsVinvyMultiCovCompVecs + pj * DxNC, D);
889     cout << "AI" << AI << endl;
890 
891     ublas::vector <double> step = -MatrixUtils::linSolve(AI, grad);
892     cout << "step" << step << endl;
893 
894     curPar = 0;
895     for (int k = 0; k <= VCs; k++)
896       for (uint64 di = 0; di < D; di++)
897 	for (uint64 dj = 0; dj <= di; dj++) {
898 	  Vegs[k](di, dj) += step(curPar++);
899 	  Vegs[k](dj, di) = Vegs[k](di, dj);
900 	}
901 
902     ALIGNED_FREE(VinvdVdparsVinvyMultiCovCompVecs);
903     ALIGNED_FREE(dVdparsVinvyMultiCovCompVecs);
904     ALIGNED_FREE(ThetasVinvyMultiCovCompVecs);
905     ALIGNED_FREE(VinvyRandsDataMultiCovCompVecs);
906   }
907 
computeAI(const vector<ublas::matrix<double>> & Vegs,const double VinvyMultiCovCompVecs[],const uchar snpVCnums[],const vector<double> & vcXscale2s,int CGmaxIters,double CGtol) const908   ublas::matrix <double> Bolt::computeAI(const vector < ublas::matrix <double> > &Vegs,
909 					 const double VinvyMultiCovCompVecs[],
910 					 const uchar snpVCnums[],
911 					 const vector <double> &vcXscale2s, int CGmaxIters,
912 					 double CGtol) const {
913 
914     int VCs = Vegs.size()-1;
915     uint64 D = Vegs[0].size1();
916     uint64 DxNC = D * (Nstride+Cstride);
917 
918     double *ThetasVinvyMultiCovCompVecs = ALIGNED_MALLOC_DOUBLES(VCs * DxNC);
919 
920     multThetaMinusIs(ThetasVinvyMultiCovCompVecs, VinvyMultiCovCompVecs, snpVCnums, vcXscale2s,
921 		     D, 0);
922 
923     int numPars = (1+VCs) * D*(D+1)/2;
924     double *dVdparsVinvyMultiCovCompVecs = ALIGNED_MALLOC_DOUBLES(numPars * DxNC);
925 
926     memset(dVdparsVinvyMultiCovCompVecs, 0, numPars*DxNC*sizeof(dVdparsVinvyMultiCovCompVecs[0]));
927     int curPar = 0;
928     for (int k = 0; k <= VCs; k++) { // populate dVdparsVinvyMultiCovCompVecs with computed vecs
929       const double *ThetasVinvyMultiCCVecs_k = k == 0 ? VinvyMultiCovCompVecs :
930 	ThetasVinvyMultiCovCompVecs + (k-1) * DxNC;
931       for (uint64 di = 0; di < D; di++)
932 	for (uint64 dj = 0; dj <= di; dj++) {
933 	  memcpy(dVdparsVinvyMultiCovCompVecs + curPar * DxNC + dj*(Nstride+Cstride),
934 		 ThetasVinvyMultiCCVecs_k + di*(Nstride+Cstride),
935 		 (Nstride+Cstride)*sizeof(ThetasVinvyMultiCCVecs_k[0]));
936 	  if (di != dj)
937 	    memcpy(dVdparsVinvyMultiCovCompVecs + curPar * DxNC + di*(Nstride+Cstride),
938 		   ThetasVinvyMultiCCVecs_k + dj*(Nstride+Cstride),
939 		   (Nstride+Cstride)*sizeof(ThetasVinvyMultiCCVecs_k[0]));
940 	  curPar++;
941 	}
942     }
943 
944     double *VinvdVdparsVinvyMultiCovCompVecs = ALIGNED_MALLOC_DOUBLES(numPars * DxNC);
945 
946     conjGradSolveVmulti(VinvdVdparsVinvyMultiCovCompVecs, false, dVdparsVinvyMultiCovCompVecs,
947 			numPars, snpVCnums, vcXscale2s, Vegs, CGmaxIters, CGtol);
948 
949     ublas::matrix <double> AI = ublas::zero_matrix <double> (numPars, numPars);
950 
951     for (int pi = 0; pi < numPars; pi++)
952       for (int pj = 0; pj < numPars; pj++)
953 	AI(pi, pj) = pj < pi ? AI(pj, pi) :
954 	  0.5 * dotMultiCovCompVecs(VinvdVdparsVinvyMultiCovCompVecs + pi * DxNC,
955 				    dVdparsVinvyMultiCovCompVecs + pj * DxNC, D);
956 
957     ALIGNED_FREE(VinvdVdparsVinvyMultiCovCompVecs);
958     ALIGNED_FREE(dVdparsVinvyMultiCovCompVecs);
959     ALIGNED_FREE(ThetasVinvyMultiCovCompVecs);
960 
961     return AI;
962   }
963 
964   /**
965    * pheno: (in) real phenotype (data rep), possibly of size N or zero-filled beyond (no covComps)
966    * snpVCnums: (in) M-vector of assignments of SNPs to VCs (0 -> ignore; 1..VCs -> var comps)
967    */
remlAI(vector<ublas::matrix<double>> & Vegs,bool usePhenoCorrs,const vector<vector<double>> & phenos,const uchar snpVCnums[],int MCtrialsCoarse,int MCtrialsFine,int CGmaxIters,double CGtol,int seed) const968   void Bolt::remlAI(vector < ublas::matrix <double> > &Vegs, bool usePhenoCorrs,
969 		    const vector < vector <double> > &phenos, const uchar snpVCnums[],
970 		    int MCtrialsCoarse, int MCtrialsFine, int CGmaxIters, double CGtol,
971 		    int seed) const {
972 
973     // determine number of VCs and scale factors vcXscale2s = 1 / "M_k":
974     // view each X_k as X_k * 1/sqrt("M_k" = sum(Xnorm2s)/(Nused-Cindep));
975     // then all 1+VCs var comps (inc. identity) are on same footing
976     int VCs = 0; vector <double> vcXscale2s(1, 1);
977     for (uint64 m = 0; m < M; m++) {
978       if (projMaskSnps[m] && snpVCnums[m] > VCs) {
979 	VCs = snpVCnums[m];
980 	vcXscale2s.resize(VCs+1);
981       }
982       if (projMaskSnps[m] && snpVCnums[m])
983 	vcXscale2s[snpVCnums[m]] += Xnorm2s[m];
984     }
985     for (int k = 1; k <= VCs; k++)
986       vcXscale2s[k] = (Nused-Cindep)/vcXscale2s[k];
987 
988     if (VCs != (int) Vegs.size() - 1) {
989       cerr << "ERROR: # of VCs represented in non-masked SNPs does not match # in model" << endl;
990       cerr << "       Did a variance component lose all of its SNPs during Bolt QC?" << endl;
991       exit(1);
992     }
993 
994     uint64 D = phenos.size();
995     uint64 DxNC = D * (Nstride+Cstride);
996     int numPars = (1+VCs) * D*(D+1)/2;
997 
998     ublas::matrix <double> AI;
999     ublas::matrix <double> phenoNormsCorrs;
1000     int MCtrials;
1001     double tolLL;
1002 
1003     for (int phase = 0; phase < 2; phase++) {
1004       cout << endl
1005 	   << "==============================================================================="
1006 	   << endl << endl;
1007       if (phase == 0) {
1008 	MCtrials = MCtrialsCoarse;
1009 	tolLL = 1e-2;
1010 	cout << "Stochastic REML optimization with MCtrials = " << MCtrials << endl << endl;
1011       }
1012       else {
1013 	if (MCtrialsFine <= MCtrialsCoarse)
1014 	  break;
1015 	MCtrials = MCtrialsFine;
1016 	tolLL = 1e-4;
1017 	cout << "Refining REML optimization with MCtrials = " << MCtrials << endl << endl;
1018       }
1019 
1020     // generate env and gen (1 gen per VC) components for rand, data phenotype vectors
1021     double *yEnvGenUnscaledMultiCovCompVecs = ALIGNED_MALLOC_DOUBLES((1+VCs)*(MCtrials+1) * DxNC);
1022     phenoNormsCorrs = genUnscaledMultiCovCompVecs(yEnvGenUnscaledMultiCovCompVecs, phenos,
1023 						  snpVCnums, VCs, vcXscale2s, MCtrials, seed);
1024     cout << "phenoNormsCorrs" << phenoNormsCorrs << endl;
1025     if (phase == 0 && usePhenoCorrs)
1026       for (int k = 0; k <= VCs; k++)
1027 	for (uint64 di = 0; di < D; di++)
1028 	  for (uint64 dj = 0; dj < di; dj++)
1029 	    Vegs[k](di, dj) = Vegs[k](dj, di) =
1030 	      sqrt(Vegs[k](di, di) * Vegs[k](dj, dj)) * phenoNormsCorrs(di, dj);
1031 
1032     double *yRandsDataMultiCovCompVecs = ALIGNED_MALLOC_DOUBLES((MCtrials+1) * DxNC);
1033     double *VinvyRandsDataMultiCovCompVecs = ALIGNED_MALLOC_DOUBLES((MCtrials+1) * DxNC);
1034     const double *VinvyMultiCovCompVecs = VinvyRandsDataMultiCovCompVecs + MCtrials * DxNC;
1035 
1036       /* EM step
1037       combineEnvGenMultiCovCompVecs(yRandsDataMultiCovCompVecs, yEnvGenUnscaledMultiCovCompVecs,
1038   				    Vegs, MCtrials);
1039       conjGradSolveVmulti(VinvyRandsDataMultiCovCompVecs, false, yRandsDataMultiCovCompVecs,
1040 			  MCtrials+1, snpVCnums, vcXscale2s, Vegs, CGmaxIters, CGtol);
1041       updateVegs(Vegs, VinvyRandsDataMultiCovCompVecs, snpVCnums, vcXscale2s, MCtrials);
1042       */
1043 
1044     cout << "Initial variance parameter guesses:" << endl;
1045     for (int k = 0; k <= VCs; k++)
1046       cout << "Vegs[" << k << "]" << Vegs[k] << endl;
1047     cout << endl;
1048     cout << "Performing initial gradient evaluation" << endl;
1049     // compute gradient
1050     combineEnvGenMultiCovCompVecs(yRandsDataMultiCovCompVecs, yEnvGenUnscaledMultiCovCompVecs,
1051 				  Vegs, MCtrials);
1052     conjGradSolveVmulti(VinvyRandsDataMultiCovCompVecs, false, yRandsDataMultiCovCompVecs,
1053 			MCtrials+1, snpVCnums, vcXscale2s, Vegs, CGmaxIters, CGtol);
1054 
1055     ublas::vector <double> grad = computeMCgrad(VinvyRandsDataMultiCovCompVecs, D, snpVCnums, VCs,
1056 						vcXscale2s, MCtrials);
1057     cout << "grad" << grad << endl << endl;
1058 
1059     const double eta1 = 1e-4, eta2 = 0.99;
1060     const double alpha1 = 0.25, alpha2 = 3.5;
1061     double Delta = 1e100; // initialize step norm bound to large
1062 
1063     const int AImaxIters = 20;
1064     for (int iter = 0; iter < AImaxIters; iter++) {
1065       cout << "-------------------------------------------------------------------------------"
1066 	   << endl << endl;
1067       cout << "Start ITER " << (iter+1) << ": computing AI matrix" << endl;
1068       AI = computeAI(Vegs, VinvyMultiCovCompVecs, snpVCnums, vcXscale2s, CGmaxIters, CGtol);
1069       //cout << "AI" << AI << endl;
1070 
1071       double dLL = -1;
1072       int att, maxAttempts = 5;
1073       bool converged = false;
1074       for (att = 1; att <= maxAttempts; att++) {
1075 	double dLLpred; ublas::vector <double> p;
1076 	vector < ublas::matrix <double> > optVegs =
1077 	  NonlinearOptMulti::constrainedNR(dLLpred, p, Vegs, grad, AI, Delta);
1078 	cout << endl << "Constrained Newton-Raphson optimized variance parameters:" << endl;
1079 	for (int k = 0; k <= VCs; k++)
1080 	  cout << "optVegs[" << k << "]" << optVegs[k] << endl;
1081 	cout << endl;
1082 	cout << "Predicted change in log likelihood: " << dLLpred << endl;
1083 	if (dLLpred < tolLL) {
1084 	  cout << "AI iteration converged: predicted change in log likelihood < tol = " << tolLL
1085 	       << endl;
1086 	  Vegs = optVegs;
1087 	  converged = true;
1088 	  break;
1089 	}
1090 
1091 	cout << endl << "Computing actual (approximate) change in log likelihood" << endl;
1092 	combineEnvGenMultiCovCompVecs(yRandsDataMultiCovCompVecs, yEnvGenUnscaledMultiCovCompVecs,
1093 				      optVegs, MCtrials);
1094 	conjGradSolveVmulti(VinvyRandsDataMultiCovCompVecs, false, yRandsDataMultiCovCompVecs,
1095 			    MCtrials+1, snpVCnums, vcXscale2s, optVegs, CGmaxIters, CGtol);
1096 	ublas::vector <double> optGrad = computeMCgrad(VinvyRandsDataMultiCovCompVecs, D,
1097 	                                               snpVCnums, VCs, vcXscale2s, MCtrials);
1098 	cout << "grad" << optGrad << endl;
1099 	dLL = ublas::inner_prod(p, 0.5*(grad+optGrad));
1100 	cout << endl << "Approximate change in log likelihood: " << dLL
1101 	     << " (attempt " << att << ")" << endl;
1102 
1103 	double rho = dLL / dLLpred;
1104 	if (ublas::norm_2(optGrad) > 2*ublas::norm_2(grad)) {
1105 	  rho = -1;
1106 	  cout << "Large increase in grad norm: dangerous model deviation?  Setting rho=-1"
1107 	       << endl;
1108 	}
1109 
1110 	cout << "rho (approximate / predicted change in LL) = " << rho << endl;
1111 	cout << "Old trust region radius: " << Delta << endl;
1112 
1113 	// update trust region radius
1114 	ublas::vector <double> Dp = p; // scale step coordinates using diagonal of AI matrix
1115 	for (int par = 0; par < numPars; par++)
1116 	  Dp(par) *= AI(par, par);
1117 
1118 	if (rho < eta1) // bad step: reduce trust region
1119 	  Delta = alpha1 * ublas::norm_2(Dp);
1120 	else if (rho < eta2) // ok step: do nothing
1121 	  ;
1122 	else // great step: expand trust region
1123 	  Delta = std::max(Delta, alpha2 * ublas::norm_2(Dp));
1124 	cout << "New trust region radius: " << Delta << endl;
1125 
1126 	if (rho > eta1) { // accept step and exit inner loop
1127 	  cout << "Accepted step" << endl;
1128 	  Vegs = optVegs;
1129 	  grad = optGrad;
1130 	  break;
1131 	}
1132 	else {
1133 	  cout << "Rejected step" << endl;
1134 	}
1135       }
1136 
1137       if (converged)
1138 	break;
1139       else if (dLL < 0) {
1140 	cerr << "WARNING: Failed to accept step in " << maxAttempts << " attempts" << endl;
1141 	cerr << "         Stopping AI iteration, but optimization may not have converged" << endl;
1142 	break;
1143       }
1144       else {
1145 	cout << endl << "End ITER " << (iter+1) << endl;
1146 	for (int k = 0; k <= VCs; k++)
1147 	  cout << "Vegs[" << k << "]" << Vegs[k] << endl;
1148 	cout << endl;
1149       }
1150     }
1151 
1152     ALIGNED_FREE(VinvyRandsDataMultiCovCompVecs);
1153     ALIGNED_FREE(yRandsDataMultiCovCompVecs);
1154     ALIGNED_FREE(yEnvGenUnscaledMultiCovCompVecs);
1155 
1156     }
1157 
1158     cout << endl;
1159     ublas::matrix <double> AIinv = MatrixUtils::invert(AI);
1160     cout << "AIinv" << AIinv << endl << endl;
1161 
1162     int curPar = 0;
1163     for (int k = 0; k <= VCs; k++) {
1164       cout << "Variance component " << k << ": " << Vegs[k] << endl;
1165       for (uint64 di = 0; di < D; di++)
1166 	for (uint64 dj = 0; dj <= di; dj++) {
1167 	  printf("  entry (%d,%d): %.6f (%.6f)", (int) dj+1, (int) di+1, Vegs[k](di, dj),
1168 		 sqrt(AIinv(curPar, curPar)));
1169 	  if (di != dj)
1170 	    printf("   corr (%d,%d): %.6f", (int) dj+1, (int) di+1,
1171 		   Vegs[k](di, dj) / sqrt(Vegs[k](di, di) * Vegs[k](dj, dj)));
1172 	  cout << endl;
1173 	  curPar++;
1174 	}
1175     }
1176     cout << endl;
1177 
1178     // apply coordinate transformation to h2, r_g; compute SEs
1179 
1180     double sigma2s[D], h2rgs[1+VCs][D][D]; // transformed pars
1181     int par[1+VCs][D][D]; // indices of parameters in [0..numPars)
1182     ublas::matrix <double> J = ublas::zero_matrix <double> (numPars, numPars);
1183 
1184     // transform off-diagonals from covariances to correlations (to get SEs on corrs)
1185     curPar = 0;
1186     for (int k = 0; k <= VCs; k++)
1187       for (uint64 i = 0; i < D; i++)
1188 	for (uint64 j = 0; j <= i; j++) {
1189 	  // to get point estimates of gen corrs, just divide by sqrt prod
1190 	  h2rgs[k][i][j] = i == j ? Vegs[k](i, j) :
1191 	    Vegs[k](i, j) / sqrt(Vegs[k](i, i) * Vegs[k](j, j));
1192 	  par[k][i][j] = curPar++;
1193 	}
1194     for (int k = 0; k <= VCs; k++)
1195       for (uint64 i = 0; i < D; i++)
1196 	for (uint64 j = 0; j <= i; j++) {
1197 	  if (i == j)
1198 	    J(par[k][i][i], par[k][i][i]) = 1;
1199 	  else {
1200 	    J(par[k][i][j], par[k][i][i]) = Vegs[k](i, j) / (2*Vegs[k](i, i));
1201 	    J(par[k][i][j], par[k][i][j]) = Vegs[k](i, j) / (h2rgs[k][i][j]);
1202 	    J(par[k][i][j], par[k][j][j]) = Vegs[k](i, j) / (2*Vegs[k](j, j));
1203 	  }
1204 	}
1205     ublas::matrix <double> rgAI =
1206       ublas::prod(ublas::matrix <double> (ublas::prod(ublas::trans(J), AI)), J);
1207 
1208     // transform variance coords to: (sigma2 scale parameter, h2_1, h2_2, ..., h2_VCs)
1209     for (int i = 0; i < (int) D; i++) {
1210       // to get point estimates of sigma2 for each trait, sum raw per-VC sigma2s over VCs
1211       sigma2s[i] = 0;
1212       for (int k = 0; k <= VCs; k++)
1213 	sigma2s[i] += Vegs[k](i, i);
1214       // to get point estimate h2s (including env), take fractions (raw sigma2s over VCs) / sum
1215       for (int k = 0; k <= VCs; k++)
1216 	h2rgs[k][i][i] = Vegs[k](i, i) / sigma2s[i];
1217     }
1218 
1219     ublas::matrix <double> h2rgAIinv[2]; h2rgAIinv[0] = h2rgAIinv[1] = rgAI;
1220     // leave out one k (VC): h2s for all other VCs, 1-sum(h2s) for left-out kOut
1221     // kOut = 0: get SEs for sigma2, h2 (for all traits), r_g (for all trait pairs)
1222     // kOut = 1: get SEs for environment/noise h2
1223     for (int kOut = 0; kOut < 2; kOut++) {
1224       for (int i = 0; i < (int) D; i++) { // apply transform to parameters for each trait in turn
1225 	J = ublas::identity_matrix <double> (numPars);
1226 	for (int k = 0; k <= VCs; k++) {
1227 	  if (k != kOut) {
1228 	    J(par[kOut][i][i], par[k][i][i]) = -sigma2s[i];
1229 	    J(par[k][i][i], par[k][i][i]) = sigma2s[i];
1230 	  }
1231 	  J(par[k][i][i], par[kOut][i][i]) = h2rgs[k][i][i];
1232 	}
1233 	// note: these Js don't affect r_g coords
1234 	h2rgAIinv[kOut] =
1235 	  ublas::prod(ublas::matrix <double> (ublas::prod(ublas::trans(J), h2rgAIinv[kOut])), J);
1236       }
1237       h2rgAIinv[kOut] = MatrixUtils::invert(h2rgAIinv[kOut]);
1238     }
1239 
1240     for (int i = 0; i < (int) D; i++)
1241       printf("Phenotype %d variance sigma2: %f (%f)\n", i+1,
1242 	     sigma2s[i] * NumericUtils::sq(phenoNormsCorrs(i, i)),
1243 	     sqrt(h2rgAIinv[0](i, i)) * NumericUtils::sq(phenoNormsCorrs(i, i)));
1244     cout << endl;
1245 
1246     const vector <string> &vcNames = snpData.getVCnames();
1247 
1248     curPar = 0;
1249     for (int k = 0; k <= VCs; k++) {
1250       cout << "Variance component " << k << ": ";
1251       if (k == 0) cout << " (environment/noise)" << endl;
1252       else cout << " \"" << vcNames[k] << "\"" << endl;
1253       for (uint64 di = 0; di < D; di++)
1254 	for (uint64 dj = 0; dj <= di; dj++) {
1255 	  if (di == dj)
1256 	    printf("  h2%c (%d,%d): %.6f (%.6f)", k==0?'e':'g', (int) dj+1, (int) di+1,
1257 		   h2rgs[k][di][dj], sqrt((1+1.0/MCtrials) * h2rgAIinv[k==0](curPar, curPar)));
1258 	  else
1259 	    printf("  %s corr (%d,%d): %.6f (%.6f)", k==0?"resid":"gen",
1260 		   (int) dj+1, (int) di+1, h2rgs[k][di][dj],
1261 		   sqrt((1+1.0/MCtrials) * h2rgAIinv[0](curPar, curPar)));
1262 	  cout << endl;
1263 	  curPar++;
1264 	}
1265     }
1266 
1267       /*
1268       vector <int> digits(VCs);
1269       for (int v = 0; v < VCs; v++)
1270 	digits[v] = std::min(10, std::max(2, (int) -log10(std::max(evalData.xSEs(v), 1e-10)) + 2));
1271       int maxDigits = *std::max_element(digits.begin(), digits.end());
1272       for (int v = 0; v < VCs; v++) {
1273 	char format[100];
1274 
1275 	char h2buf[100];
1276 	sprintf(format, "%%.%df", digits[v]);
1277 	sprintf(h2buf, format, x[v]);
1278 
1279 	char h2SEbuf[100];
1280 	sprintf(format, "(%%.%df)", digits[v]);
1281 	sprintf(h2SEbuf, format, evalData.xSEs(v));
1282 
1283 	sprintf(format, "  Variance component %%d:   %%-%ds %%-%ds", maxDigits+2, maxDigits+4);
1284 	printf(format, v+1, h2buf, h2SEbuf);
1285 	cout << "   \"" << snpData.getVCnames()[v+1] << "\"" << endl;
1286       }
1287       */
1288 
1289     cout << endl;
1290   }
1291 
1292 }
1293