1 /*
2    This file is part of the BOLT-LMM linear mixed model software package
3    developed by Po-Ru Loh.  Copyright (C) 2014-2019 Harvard University.
4 
5    This program is free software: you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published by
7    the Free Software Foundation, either version 3 of the License, or
8    (at your option) any later version.
9 
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU General Public License for more details.
14 
15    You should have received a copy of the GNU General Public License
16    along with this program.  If not, see <http://www.gnu.org/licenses/>.
17 */
18 
19 #include <cstring>
20 #include <cmath>
21 #include <cassert>
22 #include <iomanip>
23 #include <iostream>
24 #include <fstream>
25 #include <sstream>
26 #include <map>
27 
28 #ifdef USE_SSE
29 #include <emmintrin.h> // SSE2 for packed doubles
30 #endif
31 
32 #include "Types.hpp"
33 #include "SnpData.hpp"
34 #include "SnpInfo.hpp"
35 #include "RestrictSnpSet.hpp"
36 #include "FileUtils.hpp"
37 #include "MemoryUtils.hpp"
38 #include "MapInterpolater.hpp"
39 #include "LapackConst.hpp"
40 
41 namespace LMM {
42 
43   using std::vector;
44   using std::string;
45   using std::cout;
46   using std::cerr;
47   using std::endl;
48   using FileUtils::getline;
49 
50   const uint64 SnpData::IND_MISSING = (uint64) -1;
51 
chrStrToInt(const string & chrom,int Nauto)52   int SnpData::chrStrToInt(const string &chrom, int Nauto) {
53     if (isdigit(chrom[0])) {
54       int chr = atoi(chrom.c_str());
55       if (chr>=1 && chr<=Nauto+1) return chr;
56       return -1;
57     }
58     if (chrom == "X" || chrom == "XY" || chrom == "PAR1" || chrom == "PAR2") return Nauto+1;
59     return -1;
60   }
61 
62   /**
63    * work: 256x4 aligned work array
64    * lookupBedCode[4] = {value of 0, value of missing, value of 1, value of 2}
65    */
buildByteLookup(double (* work)[4],const double lookupBedCode[4]) const66   void SnpData::buildByteLookup(double (*work)[4], const double lookupBedCode[4]) const {
67     for (int byte4 = 0; byte4 < 256; byte4 += 4) {
68       for (int k = 0; k < 4; k++) // fill 4 values for first of 4 consecutive bytes
69 	work[byte4][k] = lookupBedCode[(byte4>>(k+k))&3];
70       for (int k = 1; k < 4; k++) {
71 	memcpy(work[byte4+k], work[byte4], sizeof(work[0]));
72 	work[byte4+k][0] = lookupBedCode[k];
73       }
74     }
75     /* slow? way
76     for (int byte = 0; byte < 256; byte++) {
77       for (int k = 0; k < 4; k++)
78 	work[byte][k] = lookupBedCode[(byte>>(k+k))&3];
79     }
80     */
81   }
82 
estChipLDscoresChrom(vector<double> & chipLDscores,int chrom,const vector<int> & indivInds) const83   void SnpData::estChipLDscoresChrom(vector <double> &chipLDscores, int chrom,
84 				     const vector <int> &indivInds) const {
85     uint64 chrStart = 0;
86     while (chrStart < M && snps[chrStart].chrom != chrom) chrStart++;
87     uint64 chrEnd = chrStart;
88     while (chrEnd < M && snps[chrEnd].chrom == chrom) chrEnd++;
89     uint64 Mchr = chrEnd - chrStart;
90     if (Mchr == 0) return;
91     uint64 Nsub = indivInds.size();
92 
93     //cout << "Estimating chip LD Scores on chromosome " << chrom << endl;
94     for (uint64 m = chrStart; m < chrEnd; m++) chipLDscores[m] = 1.0;
95 
96     // allocate memory
97     uchar *chrMaskSnps = ALIGNED_MALLOC_UCHARS(Mchr);
98     memset(chrMaskSnps, 0, Mchr * sizeof(chrMaskSnps[0]));
99     float *chrNormalizedGenos = ALIGNED_MALLOC_FLOATS(Mchr * Nsub);
100     memset(chrNormalizedGenos, 0, Mchr * Nsub * sizeof(chrNormalizedGenos[0]));
101     const int mBlock = 64;
102     float *dotProds = ALIGNED_MALLOC_FLOATS(Mchr * mBlock);
103 
104     // fill and normalize genotypes
105     for (uint64 mchr = 0; mchr < Mchr; mchr++) {
106       uint64 m = chrStart + mchr;
107       if (maskSnps[m])
108 	chrMaskSnps[mchr] = fillSnpSubRowNorm1(chrNormalizedGenos + mchr*Nsub, m, indivInds);
109     }
110 
111     uint64 mchrWindowStart = 0;
112     for (uint64 mchr0 = 0; mchr0 < Mchr; mchr0 += mBlock) { // sgemm to compute r2s
113       uint64 mBlockCrop = std::min(Mchr, mchr0+mBlock) - mchr0;
114       while (!isProximal(chrStart + mchrWindowStart, chrStart + mchr0, 0.01, 1000000))
115 	mchrWindowStart++;
116       uint64 prevWindowSize = mchr0+mBlockCrop-1 - mchrWindowStart;
117 
118       // [mchrWindowStart..mchr0+mBlockCrop-1) x [mchr0..mchr0+mBlockCrop)
119       {
120 	char TRANSA_ = 'T';
121 	char TRANSB_ = 'N';
122 	int M_ = prevWindowSize;
123 	int N_ = mBlockCrop;
124 	int K_ = Nsub;
125 	float ALPHA_ = 1;
126 	float *A_ = chrNormalizedGenos + mchrWindowStart*Nsub;
127 	int LDA_ = Nsub;
128 	float *B_ = chrNormalizedGenos + mchr0*Nsub;
129 	int LDB_ = Nsub;
130 	float BETA_ = 0;
131 	float *C_ = dotProds;
132 	int LDC_ = prevWindowSize;
133 	SGEMM_MACRO(&TRANSA_, &TRANSB_, &M_, &N_, &K_, &ALPHA_, A_, &LDA_, B_, &LDB_,
134 		    &BETA_, C_, &LDC_);
135       }
136 
137       for (uint64 mPlus = 0; mPlus < mBlockCrop; mPlus++) {
138 	uint64 m = chrStart + mchr0 + mPlus;
139 	if (!chrMaskSnps[m-chrStart]) continue;
140 	for (uint64 mPlus2 = 0; mchrWindowStart+mPlus2 < mchr0+mPlus; mPlus2++) {
141 	  uint64 m2 = chrStart + mchrWindowStart + mPlus2;
142 	  if (!chrMaskSnps[m2-chrStart]) continue;
143 	  float adjR2 = dotProdToAdjR2(dotProds[mPlus2 + mPlus*prevWindowSize], Nsub);
144 	  chipLDscores[m] += adjR2;
145 	  chipLDscores[m2] += adjR2;
146 	}
147       }
148     }
149 
150     ALIGNED_FREE(dotProds);
151     ALIGNED_FREE(chrNormalizedGenos);
152     ALIGNED_FREE(chrMaskSnps);
153   }
154 
dotProdToAdjR2(float dotProd,int n)155   float SnpData::dotProdToAdjR2(float dotProd, int n) {
156     float r2 = dotProd*dotProd;
157     return r2 - (1-r2)/(n-2);
158   }
159 
160   /**
161    * fills x[] with indivInds.size() elements corresponding to chosen subset of indivInds
162    * replaces missing values with average; mean-centers and normalizes vector length to 1
163    * if monomorphic among non-missing, fills with all-0s
164    *
165    * return: true if snp is polymorphic in indivInds; false if not
166    */
fillSnpSubRowNorm1(float x[],uint64 m,const vector<int> & indivInds) const167   bool SnpData::fillSnpSubRowNorm1(float x[], uint64 m, const vector <int> &indivInds) const {
168     /* lookupBedCode[4] = {value of 0, value of missing, value of 1, value of 2} */
169     const float lookupBedCode[4] = {0.0f, 9.0f, 1.0f, 2.0f};
170     float sumPresent = 0; int numPresent = 0;
171     for (uint64 i = 0; i < indivInds.size(); i++) {
172       uint64 n = indivInds[i];
173       uchar byte = genotypes[m * (Nstride>>2) + (n>>2)];
174       int k = n&3;
175       x[i] = lookupBedCode[(byte>>(k+k))&3];
176       if (x[i] != 9.0f) {
177 	sumPresent += x[i];
178 	numPresent++;
179       }
180     }
181     float avg = sumPresent / numPresent;
182     float sum2 = 0;
183     for (uint64 i = 0; i < indivInds.size(); i++) {
184       if (x[i] != 9.0f) { // non-missing; mean-center
185 	x[i] -= avg;
186 	sum2 += x[i]*x[i];
187       }
188       else // missing; replace with mean (centered to 0)
189 	x[i] = 0;
190     }
191     if (sum2 < 0.001) { // monomorphic among non-missing
192       for (uint64 i = 0; i < indivInds.size(); i++) x[i] = 0; // set to 0
193       return false;
194     }
195     else { // polymorphic
196       float invNorm = 1.0f / sqrtf(sum2);
197       for (uint64 i = 0; i < indivInds.size(); i++) x[i] *= invNorm; // normalize to vector len 1
198       return true;
199     }
200   }
201 
processIndivs(const string & famFile,const vector<string> & removeFiles)202   void SnpData::processIndivs(const string &famFile, const vector <string> &removeFiles) {
203     string line;
204 
205     vector <IndivInfo> bedIndivs;
206     FileUtils::AutoGzIfstream fin; fin.openOrExit(famFile);
207     while (getline(fin, line)) {
208       std::istringstream iss(line);
209       IndivInfo indiv;
210       if (!(iss >> indiv.famID >> indiv.indivID >> indiv.paternalID >> indiv.maternalID
211 	    >> indiv.sex >> indiv.pheno)) {
212 	cerr << "ERROR: Incorrectly formatted fam file: " << famFile << endl;
213 	cerr << "Line " << bedIndivs.size()+1 << ":" << endl;
214 	cerr << line << endl;
215 	cerr << "Unable to input 6 values (4 string, 1 int, 1 double)" << endl;
216 	exit(1);
217       }
218       string combined_ID = indiv.famID + " " + indiv.indivID;
219       if (FID_IID_to_ind.find(combined_ID) != FID_IID_to_ind.end()) {
220 	cerr << "ERROR: Duplicate individual in fam file at line " << bedIndivs.size()+1 << endl;
221 	exit(1);
222       }
223       FID_IID_to_ind[combined_ID] = bedIndivs.size();
224       bedIndivs.push_back(indiv);
225     }
226     fin.close();
227     Nbed = bedIndivs.size();
228 
229     cout << "Total indivs in PLINK data: Nbed = " << Nbed << endl;
230 
231     // process individuals to remove
232     vector <bool> useIndiv(Nbed, true);
233     for (uint f = 0; f < removeFiles.size(); f++) {
234       const string &removeFile = removeFiles[f];
235       cout << "Reading remove file (indivs to remove): " << removeFile << endl;
236       fin.openOrExit(removeFile);
237       int lineCtr = 0;
238       int numRemoved = 0;
239       int numAbsent = 0;
240       while (getline(fin, line)) {
241 	lineCtr++;
242 	std::istringstream iss(line);
243 	string FID, IID;
244 	if (!(iss >> FID >> IID)) {
245 	  cerr << "ERROR: Incorrectly formatted remove file: " << removeFile << endl;
246 	  cerr << "Line " << lineCtr << ":" << endl;
247 	  cerr << line << endl;
248 	  cerr << "Unable to input FID and IID" << endl;
249 	  exit(1);
250 	}
251 	string combined_ID = FID + " " + IID;
252 	if (FID_IID_to_ind.find(combined_ID) == FID_IID_to_ind.end()) {
253 	  if (numAbsent < 5)
254 	    cerr << "WARNING: Unable to find individual to remove: " << combined_ID << endl;
255 	  numAbsent++;
256 	}
257 	else if (useIndiv[FID_IID_to_ind[combined_ID]]) {
258 	  useIndiv[FID_IID_to_ind[combined_ID]] = false;
259 	  numRemoved++;
260 	}
261       }
262       fin.close();
263       cout << "Removed " << numRemoved << " individual(s)" << endl;
264       if (numAbsent)
265 	cerr << "WARNING: " << numAbsent << " individual(s) not found in data set" << endl;
266     }
267 
268     // determine number of indivs remaining post-removal and set up indices
269     bedIndivToRemoveIndex.resize(Nbed);
270     FID_IID_to_ind.clear(); // redo FID_IID -> indiv index
271     for (uint64 nbed = 0; nbed < Nbed; nbed++) {
272       if (useIndiv[nbed]) {
273 	bedIndivToRemoveIndex[nbed] = indivs.size();
274 	FID_IID_to_ind[bedIndivs[nbed].famID + " " + bedIndivs[nbed].indivID] = indivs.size();
275 	indivs.push_back(bedIndivs[nbed]);
276       }
277       else
278 	bedIndivToRemoveIndex[nbed] = -1;
279     }
280     N = indivs.size();
281     cout << "Total indivs stored in memory: N = " << N << endl;
282 
283     // allocate and initialize maskIndivs to all good (aside from filler at end)
284     Nstride = (N+3)&~3;
285     maskIndivs = ALIGNED_MALLOC_DOUBLES(Nstride);
286     for (uint64 n = 0; n < N; n++) maskIndivs[n] = 1;
287     for (uint64 n = N; n < Nstride; n++) maskIndivs[n] = 0;
288   }
289 
processSnps(vector<uint64> & Mfiles,const vector<string> & bimFiles,const vector<string> & excludeFiles,const vector<string> & modelSnpsFiles,const vector<string> & vcNamesIn,bool loadNonModelSnps)290   vector <SnpInfo> SnpData::processSnps(vector <uint64> &Mfiles, const vector <string> &bimFiles,
291 					const vector <string> &excludeFiles,
292 					const vector <string> &modelSnpsFiles,
293 					const vector <string> &vcNamesIn, bool loadNonModelSnps) {
294     FileUtils::AutoGzIfstream fin;
295     string line;
296 
297     vector <SnpInfo> bedSnps;
298     // read bim files
299     for (uint i = 0; i < bimFiles.size(); i++) {
300       cout << "Reading bim file #" << (i+1) << ": " << bimFiles[i] << endl;
301       vector <SnpInfo> snps_i = readBimFile(bimFiles[i], Nautosomes);
302       bedSnps.insert(bedSnps.end(), snps_i.begin(), snps_i.end());
303       Mfiles.push_back(snps_i.size());
304       cout << "    Read " << Mfiles.back() << " snps" << endl;
305     }
306     Mbed = bedSnps.size();
307 
308     cout << "Total snps in PLINK data: Mbed = " << Mbed << endl;
309 
310     // >=1 = in GRM (model), 0 = not in GRM, -1 = exclude
311     const int excludeVCnum = -1;
312     const int nonGrmVCnum = 0;
313     const int firstVCnum = 1;
314     // if list of GRM snps, default = no
315     const int defaultVCnum = modelSnpsFiles.empty() ? firstVCnum : nonGrmVCnum;
316     vector <int> snpVCnum(Mbed, (char) defaultVCnum);
317     if (vcNamesIn.empty()) {
318       vcNames = vector <string> (1, "env/noise"); // 0th entry of vcNames is ignored; VCs 1-indexed
319       if (modelSnpsFiles.empty()) // if modelSnpsFiles are given, vcNames will be populated later
320 	vcNames.push_back("modelSnps"); // if not, put all non-excluded SNPs in one variance comp
321     }
322     else
323       vcNames = vcNamesIn;
324 
325     // check for duplicate snps
326     {
327       std::set <string> rsIDs;
328       for (uint64 mbed = 0; mbed < Mbed; mbed++) {
329 	if (rsIDs.find(bedSnps[mbed].ID) != rsIDs.end()) {
330 	  cerr << "WARNING: Duplicate snp ID " << bedSnps[mbed].ID
331 	       << " -- masking duplicate" << endl;
332 	  snpVCnum[mbed] = excludeVCnum;
333 	}
334 	else
335 	  rsIDs.insert(bedSnps[mbed].ID);
336       }
337     }
338 
339     // exclude snps if list provided; restrict GRM snps if list provided
340     if (!excludeFiles.empty() || !modelSnpsFiles.empty()) {
341 
342       // create dictionary rsID -> index in full bed snp list
343       std::map <string, uint64> rsID_to_ind;
344       for (uint64 mbed = 0; mbed < Mbed; mbed++)
345 	if (rsID_to_ind.find(bedSnps[mbed].ID) == rsID_to_ind.end()) // only use first of dupe IDs
346 	  rsID_to_ind[bedSnps[mbed].ID] = mbed;
347 
348       // exclude snps
349       for (uint f = 0; f < excludeFiles.size(); f++) {
350 	const string &excludeFile = excludeFiles[f];
351 	cout << "Reading exclude file (SNPs to exclude): " << excludeFile << endl;
352 	fin.openOrExit(excludeFile);
353 	int numExcluded = 0;
354 	int numAbsent = 0;
355 	while (getline(fin, line)) {
356 	  std::istringstream iss(line);
357 	  string rsID; iss >> rsID;
358 	  if (rsID_to_ind.find(rsID) == rsID_to_ind.end()) {
359 	    if (numAbsent < 5)
360 	      cerr << "WARNING: Unable to find SNP to exclude: " << rsID << endl;
361 	    numAbsent++;
362 	  }
363 	  else if (snpVCnum[rsID_to_ind[rsID]] != excludeVCnum) {
364 	    snpVCnum[rsID_to_ind[rsID]] = excludeVCnum;
365 	    numExcluded++;
366 	  }
367 	}
368 	fin.close();
369 	cout << "Excluded " << numExcluded << " SNP(s)" << endl;
370 	if (numAbsent)
371 	  cerr << "WARNING: " << numAbsent << " SNP(s) not found in data set" << endl;
372       }
373 
374       // include GRM snps listed
375       std::map <string, int> vcNamesToInds;
376       if (!vcNamesIn.empty()) // initialize map
377 	for (int v = 1; v < (int) vcNamesIn.size(); v++)
378 	  vcNamesToInds[vcNamesIn[v]] = v;
379 
380       for (uint f = 0; f < modelSnpsFiles.size(); f++) {
381 	const string &modelSnpsFile = modelSnpsFiles[f];
382 	cout << "Reading list of SNPs to include in model (i.e., GRM): " << modelSnpsFile << endl;
383 	fin.openOrExit(modelSnpsFile);
384 	int numIncluded = 0;
385 	int numAlreadyExcluded = 0;
386 	int numAlreadyAssigned = 0;
387 	int numAbsent = 0;
388 	while (getline(fin, line)) {
389 	  std::istringstream iss(line);
390 	  string rsID; iss >> rsID;
391 	  if (rsID_to_ind.find(rsID) == rsID_to_ind.end()) {
392 	    if (numAbsent < 5)
393 	      cerr << "WARNING: Unable to find SNP to include in model: " << rsID << endl;
394 	    numAbsent++;
395 	  }
396 	  else if (snpVCnum[rsID_to_ind[rsID]] == excludeVCnum) {
397 	    if (numAlreadyExcluded < 5)
398 	      cerr << "WARNING: SNP has been excluded: " << rsID << endl;
399 	    numAlreadyExcluded++;
400 	  }
401 	  else if (snpVCnum[rsID_to_ind[rsID]] == nonGrmVCnum) {
402 	    string vcName; iss >> vcName; // ok if all lines have no other fields; then vcName=""
403 	    if (vcName.empty()) vcName = "modelSnps";
404 	    if (vcNamesToInds.find(vcName) == vcNamesToInds.end()) {
405 	      if (vcNamesIn.empty()) {
406 		vcNamesToInds[vcName] = vcNames.size();
407 		vcNames.push_back(vcName);
408 	      }
409 	      else {
410 		cerr << "ERROR: SNP " << rsID << " is assigned to VC '" << vcName
411 		     << "' not in --remlGuessStr" << endl;
412 		exit(1);
413 	      }
414 	    }
415 	    int vcNum = vcNamesToInds[vcName];
416 	    if (vcNum > SnpInfo::MAX_VC_NUM) {
417 	      cerr << "ERROR: Too many distinct variance component names (2nd column); max = "
418 		   << SnpInfo::MAX_VC_NUM << endl;
419 	      exit(1);
420 	    }
421 	    snpVCnum[rsID_to_ind[rsID]] = vcNum;
422 	    numIncluded++;
423 	  }
424 	  else {
425 	    if (numAlreadyAssigned < 5)
426 	      cerr << "WARNING: SNP was already assigned to a variance comp: " << rsID << endl;
427 	    numAlreadyAssigned++;
428 	  }
429 	}
430 	fin.close();
431 	cout << "Included " << numIncluded << " SNP(s) in model in "
432 	     << vcNames.size()-1 << " variance component(s)"<< endl;
433 	if (numAbsent)
434 	  cerr << "WARNING: " << numAbsent << " SNP(s) not found in data set" << endl;
435 	if (numAlreadyExcluded)
436 	  cerr << "WARNING: " << numAlreadyExcluded << " SNP(s) had been excluded" << endl;
437 	if (numAlreadyAssigned)
438 	  cerr << "WARNING: " << numAlreadyAssigned << " SNP(s) were multiply assigned" << endl;
439       }
440     }
441 
442     // determine number of snps remaining post-exclusion and set up index
443     M = 0; // note: M will be updated later after further QC
444     uint64 Mexclude = 0, MnonGRM = 0;
445     bedSnpToGrmIndex.resize(Mbed); // note: this index will be further updated after QC
446     for (uint64 mbed = 0; mbed < Mbed; mbed++) {
447       int vcNum = snpVCnum[mbed];
448       bedSnps[mbed].vcNum = vcNum;
449       if (vcNum >= firstVCnum) {
450 	bedSnpToGrmIndex[mbed] = (int) M;
451 	M++;
452       }
453       else if (vcNum == nonGrmVCnum && loadNonModelSnps) {
454 	bedSnpToGrmIndex[mbed] = -1;
455 	MnonGRM++;
456       }
457       else if (vcNum == nonGrmVCnum || vcNum == excludeVCnum) {
458 	bedSnpToGrmIndex[mbed] = -2;
459 	Mexclude++;
460       }
461       else assert(false); // shouldn't be any other possibilities
462     }
463     cout << endl;
464     cout << "Breakdown of SNP pre-filtering results:" << endl;
465     cout << "  " << M << " SNPs to include in model (i.e., GRM)" << endl;
466     cout << "  " << MnonGRM << " additional non-GRM SNPs loaded" << endl;
467     cout << "  " << Mexclude << " excluded SNPs" << endl;
468     /*
469     if (M < 10) {
470       cerr << "ERROR: Very few SNPs included in model; probably an input error" << endl;
471       exit(1);
472     }
473     */
474     return bedSnps;
475   }
476 
processMap(vector<SnpInfo> & bedSnps,const string & geneticMapFile,bool noMapCheck)477   void SnpData::processMap(vector <SnpInfo> &bedSnps, const string &geneticMapFile,
478 			   bool noMapCheck) {
479     // fill in map if external file provided
480     if (!geneticMapFile.empty()) {
481       cout << "Filling in genetic map coordinates using reference file:" << endl;
482       cout << "  " << geneticMapFile << endl;
483       MapInterpolater mapInterpolater(geneticMapFile);
484       for (uint64 mbed = 0; mbed < Mbed; mbed++)
485 	if (bedSnpToGrmIndex[mbed] != -2)
486 	  bedSnps[mbed].genpos =
487 	    mapInterpolater.interp(bedSnps[mbed].chrom, bedSnps[mbed].physpos);
488     }
489 
490     // check map and rescale if in cM units: calculate genpos/physpos for last autosomal snp
491     mapAvailable = false;
492     for (int mbedLast = (int) Mbed-1; mbedLast >= 0; mbedLast--)
493       if (bedSnpToGrmIndex[mbedLast] != -2 && bedSnps[mbedLast].chrom <= Nautosomes+1) {
494 	double scale = bedSnps[mbedLast].genpos / bedSnps[mbedLast].physpos;
495 	if (scale == 0)
496 	  cerr << "WARNING: No genetic map provided; using physical positions only" << endl;
497 	else if (0.5e-6 < scale && scale < 2e-6) {
498 	  cerr << "WARNING: Genetic map appears to be in cM units; rescaling by 0.01" << endl;
499 	  for (uint64 mbed = 0; mbed < Mbed; mbed++)
500 	    bedSnps[mbed].genpos *= 0.01;
501 	  mapAvailable = true;
502 	}
503 	else if (0.5e-8 < scale && scale < 2e-8)
504 	  mapAvailable = true;
505 	else {
506 	  if (noMapCheck) {
507 	    cerr << "WARNING: Genetic map appears wrong based on last genpos/bp" << endl;
508 	    cerr << "         Proceeding anyway because --noMapCheck is set" << endl;
509 	    mapAvailable = true;
510 	  }
511 	  else {
512 	    cerr << "ERROR: Genetic map appears wrong based on last genpos/bp" << endl;
513 	    cerr << "       To proceed anyway, set --noMapCheck" << endl;
514 	    exit(1);
515 	  }
516 	}
517 	break;
518       }
519   }
520 
storeBedLine(uchar bedLineOut[],const uchar genoLine[])521   void SnpData::storeBedLine(uchar bedLineOut[], const uchar genoLine[]) {
522     const int genoToBed[10] = {3, 2, 0, 0, 0, 0, 0, 0, 0, 1};
523     memset(bedLineOut, 0, (Nstride>>2) * sizeof(bedLineOut[0]));
524     for (uint64 n = 0; n < N; n++)
525       bedLineOut[n>>2] = (uchar) (bedLineOut[n>>2] | genoToBed[genoLine[n]]<<((n&3)<<1));
526   }
527 
528   /**
529    * assumes Nbed and bedIndivToRemoveIndex have been initialized
530    * if loadGenoLine == false, just advances the file pointer
531    */
readBedLine(uchar genoLine[],uchar bedLineIn[],FileUtils::AutoGzIfstream & fin,bool loadGenoLine) const532   void SnpData::readBedLine(uchar genoLine[], uchar bedLineIn[], FileUtils::AutoGzIfstream &fin,
533 			    bool loadGenoLine) const {
534     fin.read((char *) bedLineIn, (Nbed+3)>>2);
535     if (loadGenoLine) {
536       const int bedToGeno[4] = {2, 9, 1, 0};
537       for (uint64 nbed = 0; nbed < Nbed; nbed++)
538 	if (bedIndivToRemoveIndex[nbed] != -1) {
539 	  int genoValue = bedToGeno[(bedLineIn[nbed>>2]>>((nbed&3)<<1))&3];
540 	  genoLine[bedIndivToRemoveIndex[nbed]] = (uchar) genoValue;
541 	}
542     }
543   }
544 
dosageValid(double dosage)545   bool SnpData::dosageValid(double dosage) {
546     const double eps = 1e-6;
547     return -eps <= dosage && dosage <= 2+eps;
548   }
549 
computeAlleleFreq(const uchar genoLine[],const double subMaskIndivs[]) const550   double SnpData::computeAlleleFreq(const uchar genoLine[], const double subMaskIndivs[]) const {
551     double sum = 0; int num = 0;
552     for (size_t n = 0; n < N; n++)
553       if (subMaskIndivs[n] && genoLine[n] != 9) {
554 	sum += genoLine[n];
555 	num++;
556       }
557     return 0.5 * sum / num;
558   }
computeAlleleFreq(const double dosageLine[],const double subMaskIndivs[]) const559   double SnpData::computeAlleleFreq(const double dosageLine[], const double subMaskIndivs[])
560     const {
561     double sum = 0; int num = 0;
562     for (size_t n = 0; n < N; n++)
563       if (subMaskIndivs[n] && dosageValid(dosageLine[n])) {
564 	sum += dosageLine[n];
565 	num++;
566       }
567     return 0.5 * sum / num;
568   }
569 
computeMAF(const uchar genoLine[],const double subMaskIndivs[]) const570   double SnpData::computeMAF(const uchar genoLine[], const double subMaskIndivs[]) const {
571     double alleleFreq = computeAlleleFreq(genoLine, subMaskIndivs);
572     return std::min(alleleFreq, 1.0-alleleFreq);
573   }
574 
computeSnpMissing(const uchar genoLine[],const double subMaskIndivs[]) const575   double SnpData::computeSnpMissing(const uchar genoLine[], const double subMaskIndivs[]) const {
576     double sum = 0; int num = 0;
577     for (uint64 n = 0; n < N; n++)
578       if (subMaskIndivs[n]) {
579 	sum += (genoLine[n] == 9);
580 	num++;
581       }
582     return sum / num;
583   }
computeSnpMissing(const double dosageLine[],const double subMaskIndivs[]) const584   double SnpData::computeSnpMissing(const double dosageLine[], const double subMaskIndivs[])
585     const {
586     double sum = 0; int num = 0;
587     for (uint64 n = 0; n < N; n++)
588       if (subMaskIndivs[n]) {
589 	sum += !dosageValid(dosageLine[n]);
590 	num++;
591       }
592     return sum / num;
593   }
594 
595   // assumes maskedSnpVector has dimension Nstride; zero-fills
596   // note alleleFreq != MAF: alleleFreq = (mean allele count) / 2 and has full range [0..1]!
genoLineToMaskedSnpVector(double maskedSnpVector[],const uchar genoLine[],const double subMaskIndivs[],double alleleFreq) const597   void SnpData::genoLineToMaskedSnpVector(double maskedSnpVector[], const uchar genoLine[],
598 					  const double subMaskIndivs[], double alleleFreq) const {
599     for (size_t n = 0; n < N; n++) {
600       if (subMaskIndivs[n] && genoLine[n] != 9)
601 	maskedSnpVector[n] = genoLine[n] - 2*alleleFreq;
602       else
603 	maskedSnpVector[n] = 0;
604     }
605     for (uint64 n = N; n < Nstride; n++)
606       maskedSnpVector[n] = 0;
607   }
608   // assumes maskedSnpVector has dimension Nstride; zero-fills
609   // note alleleFreq != MAF: alleleFreq = (mean allele count) / 2 and has full range [0..1]!
dosageLineToMaskedSnpVector(double dosageLineVec[],const double subMaskIndivs[],double alleleFreq) const610   void SnpData::dosageLineToMaskedSnpVector(double dosageLineVec[], const double subMaskIndivs[],
611 					    double alleleFreq) const {
612     for (size_t n = 0; n < N; n++) {
613       if (subMaskIndivs[n] && dosageValid(dosageLineVec[n]))
614 	dosageLineVec[n] = dosageLineVec[n] - 2*alleleFreq;
615       else
616 	dosageLineVec[n] = 0;
617     }
618     for (uint64 n = N; n < Nstride; n++)
619       dosageLineVec[n] = 0;
620   }
621 
622   /**
623    * reads indiv info from fam file, snp info from bim file
624    * allocates memory, reads genotypes, and does QC
625    * assumes numbers of bim and bed files match
626    */
SnpData(const string & famFile,const vector<string> & bimFiles,const vector<string> & bedFiles,const string & geneticMapFile,const vector<string> & excludeFiles,const vector<string> & modelSnpsFiles,const vector<string> & removeFiles,double maxMissingPerSnp,double maxMissingPerIndiv,bool noMapCheck,vector<string> vcNamesIn,bool loadNonModelSnps,int _Nautosomes)627   SnpData::SnpData(const string &famFile, const vector <string> &bimFiles,
628 		   const vector <string> &bedFiles, const string &geneticMapFile,
629 		   const vector <string> &excludeFiles, const vector <string> &modelSnpsFiles,
630 		   const vector <string> &removeFiles,
631 		   double maxMissingPerSnp, double maxMissingPerIndiv, bool noMapCheck,
632 		   vector <string> vcNamesIn, bool loadNonModelSnps, int _Nautosomes)
633     : Nautosomes(_Nautosomes) {
634 
635     processIndivs(famFile, removeFiles);
636     // bedSnps = all snps in PLINK data; will filter and QC to create class member 'snps'
637     vector <uint64> Mfiles;
638     vector <SnpInfo> bedSnps = processSnps(Mfiles, bimFiles, excludeFiles, modelSnpsFiles,
639 					   vcNamesIn, loadNonModelSnps);
640     processMap(bedSnps, geneticMapFile, noMapCheck);
641 
642     // allocate genotypes
643     cout << "Allocating " << M << " x " << Nstride << "/4 bytes to store genotypes" << endl;
644     genotypes = ALIGNED_MALLOC_UCHARS(M * Nstride/4); // note: M will be reduced after QC
645     numIndivsQC = N;
646 
647     cout << "Reading genotypes and performing QC filtering on snps and indivs..." << endl;
648 
649     // read bed files; build final vector <SnpInfo> snps
650     vector <int> numMissingPerIndiv(N);
651     uchar *bedLineOut = genotypes;
652     uint64 mbed = 0;
653 
654     for (uint i = 0; i < bedFiles.size(); i++) {
655       if (Mfiles[i] == 0) continue;
656       uint64 bytesInFile = Mfiles[i] * (uint64) ((Nbed+3)>>2);
657       cout << "Reading bed file #" << (i+1) << ": " << bedFiles[i] << endl;
658       cout << "    Expecting " << bytesInFile << " (+3) bytes for "
659 		<< Nbed << " indivs, " << Mfiles[i] << " snps" << endl;
660       FileUtils::AutoGzIfstream fin;
661       fin.openOrExit(bedFiles[i], std::ios::in | std::ios::binary);
662       uchar header[3];
663       fin.read((char *) header, 3);
664       if (!fin || header[0] != 0x6c || header[1] != 0x1b || header[2] != 0x01) {
665 	cerr << "ERROR: Incorrect first three bytes of bed file: " << bedFiles[i] << endl;
666 	exit(1);
667       }
668 
669       // read genotypes
670       uchar *genoLine = ALIGNED_MALLOC_UCHARS(N);
671       uchar *bedLineIn = ALIGNED_MALLOC_UCHARS((Nbed+3)>>2);
672       int numSnpsFailedQC = 0;
673       for (uint64 mfile = 0; mfile < Mfiles[i]; mfile++, mbed++) {
674 	readBedLine(genoLine, bedLineIn, fin, bedSnpToGrmIndex[mbed] != -2);
675 	if (bedSnpToGrmIndex[mbed] != -2) { // not excluded
676 	  double snpMissing = computeSnpMissing(genoLine, maskIndivs);
677 	  bool snpPassQC = snpMissing <= maxMissingPerSnp;
678 	  if (snpPassQC) {
679 	    if (bedSnpToGrmIndex[mbed] >= 0) { // use in GRM
680 	      storeBedLine(bedLineOut, genoLine);
681 	      bedLineOut += Nstride>>2;
682 	      bedSnpToGrmIndex[mbed] = snps.size(); // reassign to final value
683 	      snps.push_back(bedSnps[mbed]);
684 	      snps.back().MAF = computeMAF(genoLine, maskIndivs);
685 	      // update indiv QC info
686 	      for (uint64 n = 0; n < N; n++)
687 		if (genoLine[n] == 9)
688 		  numMissingPerIndiv[n]++;
689 	    }
690 	    // if bedSnpToGrmIndex[mbed] == -1 (don't use in GRM), leave as-is
691 	  }
692 	  else {
693 	    bedSnpToGrmIndex[mbed] = -2; // exclude
694 	    if (numSnpsFailedQC < 5)
695 	      cout << "Filtering snp " << bedSnps[mbed].ID << ": "
696 		   << snpMissing << " missing" << endl;
697 	    numSnpsFailedQC++;
698 	  }
699 	}
700       }
701       ALIGNED_FREE(bedLineIn);
702       ALIGNED_FREE(genoLine);
703 
704       if (numSnpsFailedQC)
705 	cout << "Filtered " << numSnpsFailedQC << " SNPs with > " << maxMissingPerSnp << " missing"
706 	     << endl;
707 
708       if (!fin || fin.get() != EOF) {
709 	cerr << "ERROR: Wrong file size or reading error for bed file: "
710 	     << bedFiles[i] << endl;
711 	exit(1);
712       }
713       fin.close();
714     }
715 
716     M = snps.size();
717 
718     // allocate and initialize maskSnps to all good
719     maskSnps = ALIGNED_MALLOC_UCHARS(M);
720     memset(maskSnps, 1, M*sizeof(maskSnps[0]));
721 
722     // QC indivs for missingness
723     int numIndivsFailedQC = 0;
724     for (uint64 n = 0; n < N; n++)
725       if (maskIndivs[n] && numMissingPerIndiv[n] > maxMissingPerIndiv * M) {
726 	maskIndivs[n] = 0;
727 	numIndivsQC--;
728 	if (numIndivsFailedQC < 5)
729 	  cout << "Filtering indiv " << indivs[n].famID << " " << indivs[n].indivID << ": "
730 	       << numMissingPerIndiv[n] << "/" << M << " missing" << endl;
731 	numIndivsFailedQC++;
732       }
733     if (numIndivsFailedQC)
734       cout << "Filtered " << numIndivsFailedQC << " indivs with > " << maxMissingPerIndiv
735 	   << " missing" << endl;
736 
737     cout << "Total indivs after QC: " << numIndivsQC << endl;
738     cout << "Total post-QC SNPs: M = " << M << endl;
739     //std::map <int, int> vcSnpCounts;
740     vector <int> vcSnpCounts(getNumVCs()+1);
741     for (uint64 m = 0; m < M; m++)
742       if (snps[m].vcNum >= 1)
743 	vcSnpCounts[snps[m].vcNum]++;
744     for (uint v = 1; v < vcSnpCounts.size(); v++) {
745       cout << "  Variance component " << v << ": " << vcSnpCounts[v] << " post-QC SNPs (name: '"
746 	   << vcNames[v] << "')" << endl;
747       if (vcSnpCounts[v] == 0) {
748 	cerr << "  ERROR: No post-QC SNPs left in component \"" << vcNames[v] << "\"" << endl;
749 	cerr << "         Remove this component from --modelSnps to perform analysis" << endl;
750 	cerr << "         Also remove from --remlGuessStr if providing variance estimates" << endl;
751 	exit(1);
752       }
753     }
754   }
755 
~SnpData()756   SnpData::~SnpData() {
757     if (genotypes!=NULL) ALIGNED_FREE(genotypes);
758     ALIGNED_FREE(maskSnps);
759     ALIGNED_FREE(maskIndivs);
760   }
761 
freeGenotypes()762   void SnpData::freeGenotypes() {
763     ALIGNED_FREE(genotypes);
764     genotypes = NULL;
765   }
766 
readBimFile(const string & bimFile,int Nauto)767   vector <SnpInfo> SnpData::readBimFile(const string &bimFile, int Nauto) {
768     vector <SnpInfo> ret;
769     string line;
770     FileUtils::AutoGzIfstream fin; fin.openOrExit(bimFile);
771     int numOutOfOrder = 0;
772     while (getline(fin, line)) {
773       std::istringstream iss(line);
774       SnpInfo snp; string chrom_str;
775       if (!(iss >> chrom_str >> snp.ID >> snp.genpos >> snp.physpos >> snp.allele1 >> snp.allele2))
776 	{
777 	cerr << "ERROR: Incorrectly formatted bim file: " << bimFile << endl;
778 	cerr << "Line " << ret.size()+1 << ":" << endl;
779 	cerr << line << endl;
780 	cerr << "Unable to input 6 values (2 string, 1 double, 1 int, 2 string)" << endl;
781 	exit(1);
782       }
783       snp.chrom = chrStrToInt(chrom_str, Nauto);
784       if (snp.chrom == -1) {
785 	cerr << "ERROR: Unknown chromosome code in bim file: " << bimFile << endl;
786 	cerr << "Line " << ret.size()+1 << ":" << endl;
787 	cerr << line << endl;
788 	exit(1);
789       }
790       if (!ret.empty() &&
791 	  (snp.chrom < ret.back().chrom ||
792 	   (snp.chrom == ret.back().chrom && (snp.physpos <= ret.back().physpos ||
793 					       snp.genpos < ret.back().genpos)))) {
794 	if (numOutOfOrder < 5) {
795 	  cerr << "WARNING: Out-of-order snp in bim file: " << bimFile << endl;
796 	  cerr << "Line " << ret.size()+1 << ":" << endl;
797 	  cerr << line << endl;
798 	}
799 	numOutOfOrder++;
800 	//exit(1);
801       }
802       ret.push_back(snp);
803     }
804     if (numOutOfOrder)
805       cerr << "WARNING: Total number of out-of-order snps in bim file: " << numOutOfOrder << endl;
806     fin.close();
807     return ret;
808   }
809 
getM(void) const810   uint64 SnpData::getM(void) const { return M; }
811   // don't provide getN: don't want the rest of the program to even know N!
getNstride(void) const812   uint64 SnpData::getNstride(void) const { return Nstride; }
getNbed(void) const813   uint64 SnpData::getNbed(void) const { return Nbed; }
getNumIndivsQC(void) const814   uint64 SnpData::getNumIndivsQC(void) const { return numIndivsQC; }
getMaskIndivs(void) const815   const double* SnpData::getMaskIndivs(void) const { return maskIndivs; }
writeMaskIndivs(double out[]) const816   void SnpData::writeMaskIndivs(double out[]) const {
817     memcpy(out, maskIndivs, Nstride*sizeof(maskIndivs[0]));
818   }
writeMaskSnps(uchar out[]) const819   void SnpData::writeMaskSnps(uchar out[]) const { memcpy(out, maskSnps, M*sizeof(maskSnps[0])); }
getSnpInfo(void) const820   const vector <SnpInfo> &SnpData::getSnpInfo(void) const { return snps; }
getBedSnpToGrmIndex(void) const821   const vector <int> &SnpData::getBedSnpToGrmIndex(void) const { return bedSnpToGrmIndex; }
getFamPhenos(void) const822   vector <double> SnpData::getFamPhenos(void) const {
823     vector <double> phenos(N);
824     for (uint64 n = 0; n < N; n++) phenos[n] = indivs[n].pheno;
825     return phenos;
826   }
getMapAvailable(void) const827   bool SnpData::getMapAvailable(void) const { return mapAvailable; }
getGenotypes(void) const828   const uchar* SnpData::getGenotypes(void) const { return genotypes; }
getNumVCs(void) const829   int SnpData::getNumVCs(void) const { return vcNames.size()-1; }
getVCnames(void) const830   std::vector <string> SnpData::getVCnames(void) const { return vcNames; }
831 
832   /*
833   vector <double> getMAFs(void) const {
834     vector <double> MAFs(M);
835     for (uint64 m = 0; m < M; m++) MAFs[m] = snps[m].MAF;;
836     return MAFs;
837   }
838   */
getIndivInd(string & FID,string & IID) const839   uint64 SnpData::getIndivInd(string &FID, string &IID) const {
840     std::map <string, uint64>::const_iterator it = FID_IID_to_ind.find(FID+" "+IID);
841     if (it != FID_IID_to_ind.end())
842       return it->second;
843     else
844       return IND_MISSING;
845   }
846 
847   // check same chrom and within EITHER gen or phys distance
isProximal(uint64 m1,uint64 m2,double genWindow,int physWindow) const848   bool SnpData::isProximal(uint64 m1, uint64 m2, double genWindow, int physWindow) const {
849     if (snps[m1].chrom != snps[m2].chrom) return false;
850     return (mapAvailable && fabs(snps[m1].genpos - snps[m2].genpos) < genWindow) ||
851       abs(snps[m1].physpos - snps[m2].physpos) < physWindow;
852   }
853 
854   // note: pheno may have size Nstride, but we only output N values
writeFam(const string & outFile,const vector<double> & pheno) const855   void SnpData::writeFam(const string &outFile, const vector <double> &pheno) const {
856     FileUtils::AutoGzOfstream fout; fout.openOrExit(outFile);
857     for (uint64 n = 0; n < N; n++)
858       fout << indivs[n].famID << "\t" << indivs[n].indivID << "\t" << indivs[n].paternalID << "\t"
859 	   << indivs[n].maternalID << "\t" << indivs[n].sex << "\t"
860 	   << std::setprecision(10) << pheno[n] << endl;
861     fout.close();
862   }
863 
864   /*
865   // creates map <string, double> and goes through snpInfo
866   // plain text, 2-col: rs, LDscore; # to ignore
867   void loadLDscore(const string &file) {
868     // clear existing LDscore (set to NaN/-1?)
869   }
870 
871   // use LDscore regression to calibrate stats[]
872   void calibrateStats(const double stats[]) const {
873 
874   }
875   */
876 
877   // writes Nstride values to out[]
878   // assumes subMaskIndivs is a sub-mask of maskIndivs
879   //   (presumably obtained by using writeMaskIndivs and taking a subset)
880   // work: 256x4 aligned work array
buildMaskedSnpVector(double out[],const double subMaskIndivs[],uint64 m,const double lut0129[4],double (* work)[4]) const881   void SnpData::buildMaskedSnpVector(double out[], const double subMaskIndivs[], uint64 m,
882 				     const double lut0129[4], double (*work)[4]) const {
883 
884     /* lookupBedCode[4] = { value of 2 effect alleles (bed 00 = 2 effect alleles),
885                             value of missing          (bed 01 = missing),
886 			    value of 1 effect allele  (bed 10 = 1 effect allele),
887 			    value of 0 effect alleles (bed 11 = 0 effect alleles) } */
888     double lookupBedCode[4] = {lut0129[2], lut0129[3], lut0129[1], lut0129[0]};
889     buildByteLookup(work, lookupBedCode);
890 
891     uchar *ptr = genotypes + m * (Nstride>>2);
892     for (uint64 n4 = 0; n4 < Nstride; n4 += 4) {
893 #ifdef USE_SSE // todo: add AVX instructions to do all at once?
894       __m128d x01 = _mm_load_pd(&work[*ptr][0]);
895       __m128d x23 = _mm_load_pd(&work[*ptr][2]);
896       __m128d mask01 = _mm_load_pd(&subMaskIndivs[n4]);
897       __m128d mask23 = _mm_load_pd(&subMaskIndivs[n4+2]);
898       _mm_store_pd(&out[n4], _mm_mul_pd(x01, mask01));
899       _mm_store_pd(&out[n4+2], _mm_mul_pd(x23, mask23));
900 #else
901       // non-lookup approach
902       /*
903       uchar g = *ptr;
904       out[n4] = lookupBedCode[g&3] * subMaskIndivs[n4];
905       out[n4+1] = lookupBedCode[(g>>2)&3] * subMaskIndivs[n4+1];
906       out[n4+2] = lookupBedCode[(g>>4)&3] * subMaskIndivs[n4+2];
907       out[n4+3] = lookupBedCode[(g>>6)&3] * subMaskIndivs[n4+3];
908       */
909       memcpy(out + n4, work[*ptr], sizeof(work[0]));
910       for (int k = 0; k < 4; k++)
911 	out[n4+k] *= subMaskIndivs[n4+k];
912 #endif
913       ptr++;
914     }
915   }
916 
917   /**
918    * performs fast rough computation of LD Scores among chip snps to use in regression weights
919    * approximations:
920    * - only uses a subsample of individuals
921    * - replaces missing genotypes with means and uses them in dot product
922    * - computes adjusted r^2 using 1/n correction instead of baseline
923    */
estChipLDscores(uint64 sampleSize) const924   vector <double> SnpData::estChipLDscores(uint64 sampleSize) const {
925     if (sampleSize > numIndivsQC) {
926       cerr << "WARNING: Only " << numIndivsQC << " indivs available; using all" << endl;
927       sampleSize = numIndivsQC;
928     }
929     if (sampleSize & 7) {
930       sampleSize &= ~(uint64) 7;
931       cout << "Reducing sample size to " << sampleSize << " for memory alignment" << endl;
932     }
933 
934     // choose sampleSize indivs from maskIndivs
935     uint64 step = numIndivsQC / sampleSize;
936     vector <int> indivInds;
937     for (uint64 n = 0, nGood = 0; n < Nstride; n++)
938       if (maskIndivs[n]) {
939 	if (nGood % step == 0) {
940 	  indivInds.push_back(n);
941 	  if (indivInds.size() == sampleSize)
942 	    break;
943 	}
944 	nGood++;
945       }
946 
947     vector <double> chipLDscores(M, NAN);
948     for (int chrom = 1; chrom <= Nautosomes+1; chrom++)
949       estChipLDscoresChrom(chipLDscores, chrom, indivInds);
950     return chipLDscores;
951   }
952 
953 
954 
955 
956   // note n <= N because of potential missing data/masking
compute_r2(const int x[],const int y[],int dim) const957   double SnpData::compute_r2(const int x[], const int y[], int dim) const {
958     int n = 0;
959     double avg_x = 0, avg_y = 0;
960     for (int i = 0; i < dim; i++)
961       if (x[i] != 9 && y[i] != 9) {
962 	avg_x += x[i];
963 	avg_y += y[i];
964 	n++;
965       }
966     avg_x /= n; avg_y /= n;
967     double sum_xx = 0, sum_xy = 0, sum_yy = 0;
968     for (int i = 0; i < dim; i++)
969       if (x[i] != 9 && y[i] != 9) {
970 	double xi = x[i]-avg_x;
971 	double yi = y[i]-avg_y;
972 	sum_xx += xi*xi;
973 	sum_xy += xi*yi;
974 	sum_yy += yi*yi;
975       }
976     double r2 = sum_xy*sum_xy / (sum_xx*sum_yy);
977     return r2;
978     //return r2 - (1-r2)/(n-2); adjusted r2
979   }
980 
981   // fill Nstride-element array with 0, 1, 2, 9; missing or mask => 9
fillSnpRow(int x[],uint64 m) const982   void SnpData::fillSnpRow(int x[], uint64 m) const {
983     /* lookupBedCode[4] = {value of 0, value of missing, value of 1, value of 2} */
984     const int lookupBedCode[4] = {0, 9, 1, 2};
985     for (uint64 n4 = 0; n4 < Nstride; n4 += 4) {
986       uchar byte = genotypes[m * (Nstride>>2) + (n4>>2)];
987       for (int k = 0; k < 4; k++)
988 	x[n4+k] = maskIndivs[n4+k] ? lookupBedCode[(byte>>(k+k))&3] : 9;
989     }
990   }
991 
augmentLDscores(uint64 m,const vector<int> & mRow,uint64 mp,const vector<int> & mpRow,const vector<std::pair<double,int>> & windows,const vector<double> & alphaMAFdeps,vector<double> & LDscores,vector<double> & windowCounts) const992   bool SnpData::augmentLDscores(uint64 m, const vector <int> &mRow, uint64 mp, const vector <int> &mpRow,
993 		       const vector < std::pair <double, int> > &windows,
994 		       const vector <double> &alphaMAFdeps, vector <double> &LDscores,
995 		       vector <double> &windowCounts) const {
996 
997     int W = windows.size();
998     int A = alphaMAFdeps.size();
999     double r2 = compute_r2(&mRow[0], &mpRow[0], Nstride);
1000     bool foundProximal = false;
1001     for (int w = 0; w < W; w++)
1002       if (isProximal(m, mp, windows[w].first, windows[w].second)) {
1003 	foundProximal = true;
1004 	if (!std::isnan(r2)) {
1005 	  for (int a = 0; a < A; a++) {
1006 	    double weight = pow((snps[mp].MAF * (1-snps[mp].MAF)), alphaMAFdeps[a]);
1007 	    LDscores[w*A+a] += weight * r2;
1008 	    windowCounts[w*A+a] += weight;
1009 	  }
1010 	}
1011       }
1012     return foundProximal;
1013   }
1014 
computeLDscore(uint64 m,const vector<std::pair<double,int>> & windows,vector<double> & windowCounts,double & baselineR2,const RestrictSnpSet & restrictPartnerSet,const vector<double> & alphaMAFdeps) const1015   vector <double> SnpData::computeLDscore(uint64 m, const vector < std::pair <double, int> > &windows,
1016 				 vector <double> &windowCounts, double &baselineR2,
1017 				 const RestrictSnpSet &restrictPartnerSet,
1018 				 const vector <double> &alphaMAFdeps) const {
1019 
1020     int W = windows.size();
1021     int A = alphaMAFdeps.size();
1022     vector <double> LDscores(W*A);
1023     windowCounts = vector <double> (W*A);
1024 
1025     vector <int> mRow(Nstride), mpRow(Nstride);
1026     fillSnpRow(&mRow[0], m);
1027 
1028     int winSnps1 = 0, winSnps2 = 0;
1029     for (int mp = (int) m; mp >= 0; mp--) {
1030       if (snps[m].chrom != snps[mp].chrom) break;
1031       if (!restrictPartnerSet.isAllowed(snps[mp])) continue;
1032       fillSnpRow(&mpRow[0], mp);
1033       if (!augmentLDscores(m, mRow, mp, mpRow, windows, alphaMAFdeps, LDscores, windowCounts))
1034 	break;
1035       winSnps1++;
1036     }
1037     for (int mp = m+1; mp < (int) M; mp++) {
1038       if (snps[m].chrom != snps[mp].chrom) break;
1039       if (!restrictPartnerSet.isAllowed(snps[mp])) continue;
1040       fillSnpRow(&mpRow[0], mp);
1041       if (!augmentLDscores(m, mRow, mp, mpRow, windows, alphaMAFdeps, LDscores, windowCounts))
1042 	break;
1043       winSnps2++;
1044     }
1045 
1046     // compute baseline average LD to random off-chrom SNPs
1047     //double maxWindowCount = *std::max_element(windowCounts.begin(), windowCounts.end());
1048     //uint64 mStep = M / (int) maxWindowCount; // max will be for alpha=0 (actual integer count)
1049     uint64 mStep = M / (winSnps1 + winSnps2);
1050     double totOffChrom_r2s = 0;
1051     int numOffChrom_r2s = 0;
1052     for (uint64 mp = m % mStep; mp < M; mp += mStep)
1053       if (snps[mp].chrom != snps[m].chrom) {
1054 	fillSnpRow(&mpRow[0], mp);
1055 	double r2 = compute_r2(&mRow[0], &mpRow[0], Nstride);
1056 	if (!std::isnan(r2)) {
1057 	  totOffChrom_r2s += r2;
1058 	  numOffChrom_r2s++;
1059 	}
1060       }
1061     baselineR2 = totOffChrom_r2s / numOffChrom_r2s;
1062     for (int w = 0; w < W; w++)
1063       for (int a = 0; a < A; a++)
1064 	LDscores[w*A+a] = (1+baselineR2) * LDscores[w*A+a] - windowCounts[w*A+a] * baselineR2;
1065     return LDscores;
1066   }
1067 
1068 };
1069