1 /* 2 This file is part of the BOLT-LMM linear mixed model software package 3 developed by Po-Ru Loh. Copyright (C) 2014-2019 Harvard University. 4 5 This program is free software: you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation, either version 3 of the License, or 8 (at your option) any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with this program. If not, see <http://www.gnu.org/licenses/>. 17 */ 18 19 #include <cstring> 20 #include <cmath> 21 #include <cassert> 22 #include <iomanip> 23 #include <iostream> 24 #include <fstream> 25 #include <sstream> 26 #include <map> 27 28 #ifdef USE_SSE 29 #include <emmintrin.h> // SSE2 for packed doubles 30 #endif 31 32 #include "Types.hpp" 33 #include "SnpData.hpp" 34 #include "SnpInfo.hpp" 35 #include "RestrictSnpSet.hpp" 36 #include "FileUtils.hpp" 37 #include "MemoryUtils.hpp" 38 #include "MapInterpolater.hpp" 39 #include "LapackConst.hpp" 40 41 namespace LMM { 42 43 using std::vector; 44 using std::string; 45 using std::cout; 46 using std::cerr; 47 using std::endl; 48 using FileUtils::getline; 49 50 const uint64 SnpData::IND_MISSING = (uint64) -1; 51 chrStrToInt(const string & chrom,int Nauto)52 int SnpData::chrStrToInt(const string &chrom, int Nauto) { 53 if (isdigit(chrom[0])) { 54 int chr = atoi(chrom.c_str()); 55 if (chr>=1 && chr<=Nauto+1) return chr; 56 return -1; 57 } 58 if (chrom == "X" || chrom == "XY" || chrom == "PAR1" || chrom == "PAR2") return Nauto+1; 59 return -1; 60 } 61 62 /** 63 * work: 256x4 aligned work array 64 * lookupBedCode[4] = {value of 0, value of missing, value of 1, value of 2} 65 */ buildByteLookup(double (* work)[4],const double lookupBedCode[4]) const66 void SnpData::buildByteLookup(double (*work)[4], const double lookupBedCode[4]) const { 67 for (int byte4 = 0; byte4 < 256; byte4 += 4) { 68 for (int k = 0; k < 4; k++) // fill 4 values for first of 4 consecutive bytes 69 work[byte4][k] = lookupBedCode[(byte4>>(k+k))&3]; 70 for (int k = 1; k < 4; k++) { 71 memcpy(work[byte4+k], work[byte4], sizeof(work[0])); 72 work[byte4+k][0] = lookupBedCode[k]; 73 } 74 } 75 /* slow? way 76 for (int byte = 0; byte < 256; byte++) { 77 for (int k = 0; k < 4; k++) 78 work[byte][k] = lookupBedCode[(byte>>(k+k))&3]; 79 } 80 */ 81 } 82 estChipLDscoresChrom(vector<double> & chipLDscores,int chrom,const vector<int> & indivInds) const83 void SnpData::estChipLDscoresChrom(vector <double> &chipLDscores, int chrom, 84 const vector <int> &indivInds) const { 85 uint64 chrStart = 0; 86 while (chrStart < M && snps[chrStart].chrom != chrom) chrStart++; 87 uint64 chrEnd = chrStart; 88 while (chrEnd < M && snps[chrEnd].chrom == chrom) chrEnd++; 89 uint64 Mchr = chrEnd - chrStart; 90 if (Mchr == 0) return; 91 uint64 Nsub = indivInds.size(); 92 93 //cout << "Estimating chip LD Scores on chromosome " << chrom << endl; 94 for (uint64 m = chrStart; m < chrEnd; m++) chipLDscores[m] = 1.0; 95 96 // allocate memory 97 uchar *chrMaskSnps = ALIGNED_MALLOC_UCHARS(Mchr); 98 memset(chrMaskSnps, 0, Mchr * sizeof(chrMaskSnps[0])); 99 float *chrNormalizedGenos = ALIGNED_MALLOC_FLOATS(Mchr * Nsub); 100 memset(chrNormalizedGenos, 0, Mchr * Nsub * sizeof(chrNormalizedGenos[0])); 101 const int mBlock = 64; 102 float *dotProds = ALIGNED_MALLOC_FLOATS(Mchr * mBlock); 103 104 // fill and normalize genotypes 105 for (uint64 mchr = 0; mchr < Mchr; mchr++) { 106 uint64 m = chrStart + mchr; 107 if (maskSnps[m]) 108 chrMaskSnps[mchr] = fillSnpSubRowNorm1(chrNormalizedGenos + mchr*Nsub, m, indivInds); 109 } 110 111 uint64 mchrWindowStart = 0; 112 for (uint64 mchr0 = 0; mchr0 < Mchr; mchr0 += mBlock) { // sgemm to compute r2s 113 uint64 mBlockCrop = std::min(Mchr, mchr0+mBlock) - mchr0; 114 while (!isProximal(chrStart + mchrWindowStart, chrStart + mchr0, 0.01, 1000000)) 115 mchrWindowStart++; 116 uint64 prevWindowSize = mchr0+mBlockCrop-1 - mchrWindowStart; 117 118 // [mchrWindowStart..mchr0+mBlockCrop-1) x [mchr0..mchr0+mBlockCrop) 119 { 120 char TRANSA_ = 'T'; 121 char TRANSB_ = 'N'; 122 int M_ = prevWindowSize; 123 int N_ = mBlockCrop; 124 int K_ = Nsub; 125 float ALPHA_ = 1; 126 float *A_ = chrNormalizedGenos + mchrWindowStart*Nsub; 127 int LDA_ = Nsub; 128 float *B_ = chrNormalizedGenos + mchr0*Nsub; 129 int LDB_ = Nsub; 130 float BETA_ = 0; 131 float *C_ = dotProds; 132 int LDC_ = prevWindowSize; 133 SGEMM_MACRO(&TRANSA_, &TRANSB_, &M_, &N_, &K_, &ALPHA_, A_, &LDA_, B_, &LDB_, 134 &BETA_, C_, &LDC_); 135 } 136 137 for (uint64 mPlus = 0; mPlus < mBlockCrop; mPlus++) { 138 uint64 m = chrStart + mchr0 + mPlus; 139 if (!chrMaskSnps[m-chrStart]) continue; 140 for (uint64 mPlus2 = 0; mchrWindowStart+mPlus2 < mchr0+mPlus; mPlus2++) { 141 uint64 m2 = chrStart + mchrWindowStart + mPlus2; 142 if (!chrMaskSnps[m2-chrStart]) continue; 143 float adjR2 = dotProdToAdjR2(dotProds[mPlus2 + mPlus*prevWindowSize], Nsub); 144 chipLDscores[m] += adjR2; 145 chipLDscores[m2] += adjR2; 146 } 147 } 148 } 149 150 ALIGNED_FREE(dotProds); 151 ALIGNED_FREE(chrNormalizedGenos); 152 ALIGNED_FREE(chrMaskSnps); 153 } 154 dotProdToAdjR2(float dotProd,int n)155 float SnpData::dotProdToAdjR2(float dotProd, int n) { 156 float r2 = dotProd*dotProd; 157 return r2 - (1-r2)/(n-2); 158 } 159 160 /** 161 * fills x[] with indivInds.size() elements corresponding to chosen subset of indivInds 162 * replaces missing values with average; mean-centers and normalizes vector length to 1 163 * if monomorphic among non-missing, fills with all-0s 164 * 165 * return: true if snp is polymorphic in indivInds; false if not 166 */ fillSnpSubRowNorm1(float x[],uint64 m,const vector<int> & indivInds) const167 bool SnpData::fillSnpSubRowNorm1(float x[], uint64 m, const vector <int> &indivInds) const { 168 /* lookupBedCode[4] = {value of 0, value of missing, value of 1, value of 2} */ 169 const float lookupBedCode[4] = {0.0f, 9.0f, 1.0f, 2.0f}; 170 float sumPresent = 0; int numPresent = 0; 171 for (uint64 i = 0; i < indivInds.size(); i++) { 172 uint64 n = indivInds[i]; 173 uchar byte = genotypes[m * (Nstride>>2) + (n>>2)]; 174 int k = n&3; 175 x[i] = lookupBedCode[(byte>>(k+k))&3]; 176 if (x[i] != 9.0f) { 177 sumPresent += x[i]; 178 numPresent++; 179 } 180 } 181 float avg = sumPresent / numPresent; 182 float sum2 = 0; 183 for (uint64 i = 0; i < indivInds.size(); i++) { 184 if (x[i] != 9.0f) { // non-missing; mean-center 185 x[i] -= avg; 186 sum2 += x[i]*x[i]; 187 } 188 else // missing; replace with mean (centered to 0) 189 x[i] = 0; 190 } 191 if (sum2 < 0.001) { // monomorphic among non-missing 192 for (uint64 i = 0; i < indivInds.size(); i++) x[i] = 0; // set to 0 193 return false; 194 } 195 else { // polymorphic 196 float invNorm = 1.0f / sqrtf(sum2); 197 for (uint64 i = 0; i < indivInds.size(); i++) x[i] *= invNorm; // normalize to vector len 1 198 return true; 199 } 200 } 201 processIndivs(const string & famFile,const vector<string> & removeFiles)202 void SnpData::processIndivs(const string &famFile, const vector <string> &removeFiles) { 203 string line; 204 205 vector <IndivInfo> bedIndivs; 206 FileUtils::AutoGzIfstream fin; fin.openOrExit(famFile); 207 while (getline(fin, line)) { 208 std::istringstream iss(line); 209 IndivInfo indiv; 210 if (!(iss >> indiv.famID >> indiv.indivID >> indiv.paternalID >> indiv.maternalID 211 >> indiv.sex >> indiv.pheno)) { 212 cerr << "ERROR: Incorrectly formatted fam file: " << famFile << endl; 213 cerr << "Line " << bedIndivs.size()+1 << ":" << endl; 214 cerr << line << endl; 215 cerr << "Unable to input 6 values (4 string, 1 int, 1 double)" << endl; 216 exit(1); 217 } 218 string combined_ID = indiv.famID + " " + indiv.indivID; 219 if (FID_IID_to_ind.find(combined_ID) != FID_IID_to_ind.end()) { 220 cerr << "ERROR: Duplicate individual in fam file at line " << bedIndivs.size()+1 << endl; 221 exit(1); 222 } 223 FID_IID_to_ind[combined_ID] = bedIndivs.size(); 224 bedIndivs.push_back(indiv); 225 } 226 fin.close(); 227 Nbed = bedIndivs.size(); 228 229 cout << "Total indivs in PLINK data: Nbed = " << Nbed << endl; 230 231 // process individuals to remove 232 vector <bool> useIndiv(Nbed, true); 233 for (uint f = 0; f < removeFiles.size(); f++) { 234 const string &removeFile = removeFiles[f]; 235 cout << "Reading remove file (indivs to remove): " << removeFile << endl; 236 fin.openOrExit(removeFile); 237 int lineCtr = 0; 238 int numRemoved = 0; 239 int numAbsent = 0; 240 while (getline(fin, line)) { 241 lineCtr++; 242 std::istringstream iss(line); 243 string FID, IID; 244 if (!(iss >> FID >> IID)) { 245 cerr << "ERROR: Incorrectly formatted remove file: " << removeFile << endl; 246 cerr << "Line " << lineCtr << ":" << endl; 247 cerr << line << endl; 248 cerr << "Unable to input FID and IID" << endl; 249 exit(1); 250 } 251 string combined_ID = FID + " " + IID; 252 if (FID_IID_to_ind.find(combined_ID) == FID_IID_to_ind.end()) { 253 if (numAbsent < 5) 254 cerr << "WARNING: Unable to find individual to remove: " << combined_ID << endl; 255 numAbsent++; 256 } 257 else if (useIndiv[FID_IID_to_ind[combined_ID]]) { 258 useIndiv[FID_IID_to_ind[combined_ID]] = false; 259 numRemoved++; 260 } 261 } 262 fin.close(); 263 cout << "Removed " << numRemoved << " individual(s)" << endl; 264 if (numAbsent) 265 cerr << "WARNING: " << numAbsent << " individual(s) not found in data set" << endl; 266 } 267 268 // determine number of indivs remaining post-removal and set up indices 269 bedIndivToRemoveIndex.resize(Nbed); 270 FID_IID_to_ind.clear(); // redo FID_IID -> indiv index 271 for (uint64 nbed = 0; nbed < Nbed; nbed++) { 272 if (useIndiv[nbed]) { 273 bedIndivToRemoveIndex[nbed] = indivs.size(); 274 FID_IID_to_ind[bedIndivs[nbed].famID + " " + bedIndivs[nbed].indivID] = indivs.size(); 275 indivs.push_back(bedIndivs[nbed]); 276 } 277 else 278 bedIndivToRemoveIndex[nbed] = -1; 279 } 280 N = indivs.size(); 281 cout << "Total indivs stored in memory: N = " << N << endl; 282 283 // allocate and initialize maskIndivs to all good (aside from filler at end) 284 Nstride = (N+3)&~3; 285 maskIndivs = ALIGNED_MALLOC_DOUBLES(Nstride); 286 for (uint64 n = 0; n < N; n++) maskIndivs[n] = 1; 287 for (uint64 n = N; n < Nstride; n++) maskIndivs[n] = 0; 288 } 289 processSnps(vector<uint64> & Mfiles,const vector<string> & bimFiles,const vector<string> & excludeFiles,const vector<string> & modelSnpsFiles,const vector<string> & vcNamesIn,bool loadNonModelSnps)290 vector <SnpInfo> SnpData::processSnps(vector <uint64> &Mfiles, const vector <string> &bimFiles, 291 const vector <string> &excludeFiles, 292 const vector <string> &modelSnpsFiles, 293 const vector <string> &vcNamesIn, bool loadNonModelSnps) { 294 FileUtils::AutoGzIfstream fin; 295 string line; 296 297 vector <SnpInfo> bedSnps; 298 // read bim files 299 for (uint i = 0; i < bimFiles.size(); i++) { 300 cout << "Reading bim file #" << (i+1) << ": " << bimFiles[i] << endl; 301 vector <SnpInfo> snps_i = readBimFile(bimFiles[i], Nautosomes); 302 bedSnps.insert(bedSnps.end(), snps_i.begin(), snps_i.end()); 303 Mfiles.push_back(snps_i.size()); 304 cout << " Read " << Mfiles.back() << " snps" << endl; 305 } 306 Mbed = bedSnps.size(); 307 308 cout << "Total snps in PLINK data: Mbed = " << Mbed << endl; 309 310 // >=1 = in GRM (model), 0 = not in GRM, -1 = exclude 311 const int excludeVCnum = -1; 312 const int nonGrmVCnum = 0; 313 const int firstVCnum = 1; 314 // if list of GRM snps, default = no 315 const int defaultVCnum = modelSnpsFiles.empty() ? firstVCnum : nonGrmVCnum; 316 vector <int> snpVCnum(Mbed, (char) defaultVCnum); 317 if (vcNamesIn.empty()) { 318 vcNames = vector <string> (1, "env/noise"); // 0th entry of vcNames is ignored; VCs 1-indexed 319 if (modelSnpsFiles.empty()) // if modelSnpsFiles are given, vcNames will be populated later 320 vcNames.push_back("modelSnps"); // if not, put all non-excluded SNPs in one variance comp 321 } 322 else 323 vcNames = vcNamesIn; 324 325 // check for duplicate snps 326 { 327 std::set <string> rsIDs; 328 for (uint64 mbed = 0; mbed < Mbed; mbed++) { 329 if (rsIDs.find(bedSnps[mbed].ID) != rsIDs.end()) { 330 cerr << "WARNING: Duplicate snp ID " << bedSnps[mbed].ID 331 << " -- masking duplicate" << endl; 332 snpVCnum[mbed] = excludeVCnum; 333 } 334 else 335 rsIDs.insert(bedSnps[mbed].ID); 336 } 337 } 338 339 // exclude snps if list provided; restrict GRM snps if list provided 340 if (!excludeFiles.empty() || !modelSnpsFiles.empty()) { 341 342 // create dictionary rsID -> index in full bed snp list 343 std::map <string, uint64> rsID_to_ind; 344 for (uint64 mbed = 0; mbed < Mbed; mbed++) 345 if (rsID_to_ind.find(bedSnps[mbed].ID) == rsID_to_ind.end()) // only use first of dupe IDs 346 rsID_to_ind[bedSnps[mbed].ID] = mbed; 347 348 // exclude snps 349 for (uint f = 0; f < excludeFiles.size(); f++) { 350 const string &excludeFile = excludeFiles[f]; 351 cout << "Reading exclude file (SNPs to exclude): " << excludeFile << endl; 352 fin.openOrExit(excludeFile); 353 int numExcluded = 0; 354 int numAbsent = 0; 355 while (getline(fin, line)) { 356 std::istringstream iss(line); 357 string rsID; iss >> rsID; 358 if (rsID_to_ind.find(rsID) == rsID_to_ind.end()) { 359 if (numAbsent < 5) 360 cerr << "WARNING: Unable to find SNP to exclude: " << rsID << endl; 361 numAbsent++; 362 } 363 else if (snpVCnum[rsID_to_ind[rsID]] != excludeVCnum) { 364 snpVCnum[rsID_to_ind[rsID]] = excludeVCnum; 365 numExcluded++; 366 } 367 } 368 fin.close(); 369 cout << "Excluded " << numExcluded << " SNP(s)" << endl; 370 if (numAbsent) 371 cerr << "WARNING: " << numAbsent << " SNP(s) not found in data set" << endl; 372 } 373 374 // include GRM snps listed 375 std::map <string, int> vcNamesToInds; 376 if (!vcNamesIn.empty()) // initialize map 377 for (int v = 1; v < (int) vcNamesIn.size(); v++) 378 vcNamesToInds[vcNamesIn[v]] = v; 379 380 for (uint f = 0; f < modelSnpsFiles.size(); f++) { 381 const string &modelSnpsFile = modelSnpsFiles[f]; 382 cout << "Reading list of SNPs to include in model (i.e., GRM): " << modelSnpsFile << endl; 383 fin.openOrExit(modelSnpsFile); 384 int numIncluded = 0; 385 int numAlreadyExcluded = 0; 386 int numAlreadyAssigned = 0; 387 int numAbsent = 0; 388 while (getline(fin, line)) { 389 std::istringstream iss(line); 390 string rsID; iss >> rsID; 391 if (rsID_to_ind.find(rsID) == rsID_to_ind.end()) { 392 if (numAbsent < 5) 393 cerr << "WARNING: Unable to find SNP to include in model: " << rsID << endl; 394 numAbsent++; 395 } 396 else if (snpVCnum[rsID_to_ind[rsID]] == excludeVCnum) { 397 if (numAlreadyExcluded < 5) 398 cerr << "WARNING: SNP has been excluded: " << rsID << endl; 399 numAlreadyExcluded++; 400 } 401 else if (snpVCnum[rsID_to_ind[rsID]] == nonGrmVCnum) { 402 string vcName; iss >> vcName; // ok if all lines have no other fields; then vcName="" 403 if (vcName.empty()) vcName = "modelSnps"; 404 if (vcNamesToInds.find(vcName) == vcNamesToInds.end()) { 405 if (vcNamesIn.empty()) { 406 vcNamesToInds[vcName] = vcNames.size(); 407 vcNames.push_back(vcName); 408 } 409 else { 410 cerr << "ERROR: SNP " << rsID << " is assigned to VC '" << vcName 411 << "' not in --remlGuessStr" << endl; 412 exit(1); 413 } 414 } 415 int vcNum = vcNamesToInds[vcName]; 416 if (vcNum > SnpInfo::MAX_VC_NUM) { 417 cerr << "ERROR: Too many distinct variance component names (2nd column); max = " 418 << SnpInfo::MAX_VC_NUM << endl; 419 exit(1); 420 } 421 snpVCnum[rsID_to_ind[rsID]] = vcNum; 422 numIncluded++; 423 } 424 else { 425 if (numAlreadyAssigned < 5) 426 cerr << "WARNING: SNP was already assigned to a variance comp: " << rsID << endl; 427 numAlreadyAssigned++; 428 } 429 } 430 fin.close(); 431 cout << "Included " << numIncluded << " SNP(s) in model in " 432 << vcNames.size()-1 << " variance component(s)"<< endl; 433 if (numAbsent) 434 cerr << "WARNING: " << numAbsent << " SNP(s) not found in data set" << endl; 435 if (numAlreadyExcluded) 436 cerr << "WARNING: " << numAlreadyExcluded << " SNP(s) had been excluded" << endl; 437 if (numAlreadyAssigned) 438 cerr << "WARNING: " << numAlreadyAssigned << " SNP(s) were multiply assigned" << endl; 439 } 440 } 441 442 // determine number of snps remaining post-exclusion and set up index 443 M = 0; // note: M will be updated later after further QC 444 uint64 Mexclude = 0, MnonGRM = 0; 445 bedSnpToGrmIndex.resize(Mbed); // note: this index will be further updated after QC 446 for (uint64 mbed = 0; mbed < Mbed; mbed++) { 447 int vcNum = snpVCnum[mbed]; 448 bedSnps[mbed].vcNum = vcNum; 449 if (vcNum >= firstVCnum) { 450 bedSnpToGrmIndex[mbed] = (int) M; 451 M++; 452 } 453 else if (vcNum == nonGrmVCnum && loadNonModelSnps) { 454 bedSnpToGrmIndex[mbed] = -1; 455 MnonGRM++; 456 } 457 else if (vcNum == nonGrmVCnum || vcNum == excludeVCnum) { 458 bedSnpToGrmIndex[mbed] = -2; 459 Mexclude++; 460 } 461 else assert(false); // shouldn't be any other possibilities 462 } 463 cout << endl; 464 cout << "Breakdown of SNP pre-filtering results:" << endl; 465 cout << " " << M << " SNPs to include in model (i.e., GRM)" << endl; 466 cout << " " << MnonGRM << " additional non-GRM SNPs loaded" << endl; 467 cout << " " << Mexclude << " excluded SNPs" << endl; 468 /* 469 if (M < 10) { 470 cerr << "ERROR: Very few SNPs included in model; probably an input error" << endl; 471 exit(1); 472 } 473 */ 474 return bedSnps; 475 } 476 processMap(vector<SnpInfo> & bedSnps,const string & geneticMapFile,bool noMapCheck)477 void SnpData::processMap(vector <SnpInfo> &bedSnps, const string &geneticMapFile, 478 bool noMapCheck) { 479 // fill in map if external file provided 480 if (!geneticMapFile.empty()) { 481 cout << "Filling in genetic map coordinates using reference file:" << endl; 482 cout << " " << geneticMapFile << endl; 483 MapInterpolater mapInterpolater(geneticMapFile); 484 for (uint64 mbed = 0; mbed < Mbed; mbed++) 485 if (bedSnpToGrmIndex[mbed] != -2) 486 bedSnps[mbed].genpos = 487 mapInterpolater.interp(bedSnps[mbed].chrom, bedSnps[mbed].physpos); 488 } 489 490 // check map and rescale if in cM units: calculate genpos/physpos for last autosomal snp 491 mapAvailable = false; 492 for (int mbedLast = (int) Mbed-1; mbedLast >= 0; mbedLast--) 493 if (bedSnpToGrmIndex[mbedLast] != -2 && bedSnps[mbedLast].chrom <= Nautosomes+1) { 494 double scale = bedSnps[mbedLast].genpos / bedSnps[mbedLast].physpos; 495 if (scale == 0) 496 cerr << "WARNING: No genetic map provided; using physical positions only" << endl; 497 else if (0.5e-6 < scale && scale < 2e-6) { 498 cerr << "WARNING: Genetic map appears to be in cM units; rescaling by 0.01" << endl; 499 for (uint64 mbed = 0; mbed < Mbed; mbed++) 500 bedSnps[mbed].genpos *= 0.01; 501 mapAvailable = true; 502 } 503 else if (0.5e-8 < scale && scale < 2e-8) 504 mapAvailable = true; 505 else { 506 if (noMapCheck) { 507 cerr << "WARNING: Genetic map appears wrong based on last genpos/bp" << endl; 508 cerr << " Proceeding anyway because --noMapCheck is set" << endl; 509 mapAvailable = true; 510 } 511 else { 512 cerr << "ERROR: Genetic map appears wrong based on last genpos/bp" << endl; 513 cerr << " To proceed anyway, set --noMapCheck" << endl; 514 exit(1); 515 } 516 } 517 break; 518 } 519 } 520 storeBedLine(uchar bedLineOut[],const uchar genoLine[])521 void SnpData::storeBedLine(uchar bedLineOut[], const uchar genoLine[]) { 522 const int genoToBed[10] = {3, 2, 0, 0, 0, 0, 0, 0, 0, 1}; 523 memset(bedLineOut, 0, (Nstride>>2) * sizeof(bedLineOut[0])); 524 for (uint64 n = 0; n < N; n++) 525 bedLineOut[n>>2] = (uchar) (bedLineOut[n>>2] | genoToBed[genoLine[n]]<<((n&3)<<1)); 526 } 527 528 /** 529 * assumes Nbed and bedIndivToRemoveIndex have been initialized 530 * if loadGenoLine == false, just advances the file pointer 531 */ readBedLine(uchar genoLine[],uchar bedLineIn[],FileUtils::AutoGzIfstream & fin,bool loadGenoLine) const532 void SnpData::readBedLine(uchar genoLine[], uchar bedLineIn[], FileUtils::AutoGzIfstream &fin, 533 bool loadGenoLine) const { 534 fin.read((char *) bedLineIn, (Nbed+3)>>2); 535 if (loadGenoLine) { 536 const int bedToGeno[4] = {2, 9, 1, 0}; 537 for (uint64 nbed = 0; nbed < Nbed; nbed++) 538 if (bedIndivToRemoveIndex[nbed] != -1) { 539 int genoValue = bedToGeno[(bedLineIn[nbed>>2]>>((nbed&3)<<1))&3]; 540 genoLine[bedIndivToRemoveIndex[nbed]] = (uchar) genoValue; 541 } 542 } 543 } 544 dosageValid(double dosage)545 bool SnpData::dosageValid(double dosage) { 546 const double eps = 1e-6; 547 return -eps <= dosage && dosage <= 2+eps; 548 } 549 computeAlleleFreq(const uchar genoLine[],const double subMaskIndivs[]) const550 double SnpData::computeAlleleFreq(const uchar genoLine[], const double subMaskIndivs[]) const { 551 double sum = 0; int num = 0; 552 for (size_t n = 0; n < N; n++) 553 if (subMaskIndivs[n] && genoLine[n] != 9) { 554 sum += genoLine[n]; 555 num++; 556 } 557 return 0.5 * sum / num; 558 } computeAlleleFreq(const double dosageLine[],const double subMaskIndivs[]) const559 double SnpData::computeAlleleFreq(const double dosageLine[], const double subMaskIndivs[]) 560 const { 561 double sum = 0; int num = 0; 562 for (size_t n = 0; n < N; n++) 563 if (subMaskIndivs[n] && dosageValid(dosageLine[n])) { 564 sum += dosageLine[n]; 565 num++; 566 } 567 return 0.5 * sum / num; 568 } 569 computeMAF(const uchar genoLine[],const double subMaskIndivs[]) const570 double SnpData::computeMAF(const uchar genoLine[], const double subMaskIndivs[]) const { 571 double alleleFreq = computeAlleleFreq(genoLine, subMaskIndivs); 572 return std::min(alleleFreq, 1.0-alleleFreq); 573 } 574 computeSnpMissing(const uchar genoLine[],const double subMaskIndivs[]) const575 double SnpData::computeSnpMissing(const uchar genoLine[], const double subMaskIndivs[]) const { 576 double sum = 0; int num = 0; 577 for (uint64 n = 0; n < N; n++) 578 if (subMaskIndivs[n]) { 579 sum += (genoLine[n] == 9); 580 num++; 581 } 582 return sum / num; 583 } computeSnpMissing(const double dosageLine[],const double subMaskIndivs[]) const584 double SnpData::computeSnpMissing(const double dosageLine[], const double subMaskIndivs[]) 585 const { 586 double sum = 0; int num = 0; 587 for (uint64 n = 0; n < N; n++) 588 if (subMaskIndivs[n]) { 589 sum += !dosageValid(dosageLine[n]); 590 num++; 591 } 592 return sum / num; 593 } 594 595 // assumes maskedSnpVector has dimension Nstride; zero-fills 596 // note alleleFreq != MAF: alleleFreq = (mean allele count) / 2 and has full range [0..1]! genoLineToMaskedSnpVector(double maskedSnpVector[],const uchar genoLine[],const double subMaskIndivs[],double alleleFreq) const597 void SnpData::genoLineToMaskedSnpVector(double maskedSnpVector[], const uchar genoLine[], 598 const double subMaskIndivs[], double alleleFreq) const { 599 for (size_t n = 0; n < N; n++) { 600 if (subMaskIndivs[n] && genoLine[n] != 9) 601 maskedSnpVector[n] = genoLine[n] - 2*alleleFreq; 602 else 603 maskedSnpVector[n] = 0; 604 } 605 for (uint64 n = N; n < Nstride; n++) 606 maskedSnpVector[n] = 0; 607 } 608 // assumes maskedSnpVector has dimension Nstride; zero-fills 609 // note alleleFreq != MAF: alleleFreq = (mean allele count) / 2 and has full range [0..1]! dosageLineToMaskedSnpVector(double dosageLineVec[],const double subMaskIndivs[],double alleleFreq) const610 void SnpData::dosageLineToMaskedSnpVector(double dosageLineVec[], const double subMaskIndivs[], 611 double alleleFreq) const { 612 for (size_t n = 0; n < N; n++) { 613 if (subMaskIndivs[n] && dosageValid(dosageLineVec[n])) 614 dosageLineVec[n] = dosageLineVec[n] - 2*alleleFreq; 615 else 616 dosageLineVec[n] = 0; 617 } 618 for (uint64 n = N; n < Nstride; n++) 619 dosageLineVec[n] = 0; 620 } 621 622 /** 623 * reads indiv info from fam file, snp info from bim file 624 * allocates memory, reads genotypes, and does QC 625 * assumes numbers of bim and bed files match 626 */ SnpData(const string & famFile,const vector<string> & bimFiles,const vector<string> & bedFiles,const string & geneticMapFile,const vector<string> & excludeFiles,const vector<string> & modelSnpsFiles,const vector<string> & removeFiles,double maxMissingPerSnp,double maxMissingPerIndiv,bool noMapCheck,vector<string> vcNamesIn,bool loadNonModelSnps,int _Nautosomes)627 SnpData::SnpData(const string &famFile, const vector <string> &bimFiles, 628 const vector <string> &bedFiles, const string &geneticMapFile, 629 const vector <string> &excludeFiles, const vector <string> &modelSnpsFiles, 630 const vector <string> &removeFiles, 631 double maxMissingPerSnp, double maxMissingPerIndiv, bool noMapCheck, 632 vector <string> vcNamesIn, bool loadNonModelSnps, int _Nautosomes) 633 : Nautosomes(_Nautosomes) { 634 635 processIndivs(famFile, removeFiles); 636 // bedSnps = all snps in PLINK data; will filter and QC to create class member 'snps' 637 vector <uint64> Mfiles; 638 vector <SnpInfo> bedSnps = processSnps(Mfiles, bimFiles, excludeFiles, modelSnpsFiles, 639 vcNamesIn, loadNonModelSnps); 640 processMap(bedSnps, geneticMapFile, noMapCheck); 641 642 // allocate genotypes 643 cout << "Allocating " << M << " x " << Nstride << "/4 bytes to store genotypes" << endl; 644 genotypes = ALIGNED_MALLOC_UCHARS(M * Nstride/4); // note: M will be reduced after QC 645 numIndivsQC = N; 646 647 cout << "Reading genotypes and performing QC filtering on snps and indivs..." << endl; 648 649 // read bed files; build final vector <SnpInfo> snps 650 vector <int> numMissingPerIndiv(N); 651 uchar *bedLineOut = genotypes; 652 uint64 mbed = 0; 653 654 for (uint i = 0; i < bedFiles.size(); i++) { 655 if (Mfiles[i] == 0) continue; 656 uint64 bytesInFile = Mfiles[i] * (uint64) ((Nbed+3)>>2); 657 cout << "Reading bed file #" << (i+1) << ": " << bedFiles[i] << endl; 658 cout << " Expecting " << bytesInFile << " (+3) bytes for " 659 << Nbed << " indivs, " << Mfiles[i] << " snps" << endl; 660 FileUtils::AutoGzIfstream fin; 661 fin.openOrExit(bedFiles[i], std::ios::in | std::ios::binary); 662 uchar header[3]; 663 fin.read((char *) header, 3); 664 if (!fin || header[0] != 0x6c || header[1] != 0x1b || header[2] != 0x01) { 665 cerr << "ERROR: Incorrect first three bytes of bed file: " << bedFiles[i] << endl; 666 exit(1); 667 } 668 669 // read genotypes 670 uchar *genoLine = ALIGNED_MALLOC_UCHARS(N); 671 uchar *bedLineIn = ALIGNED_MALLOC_UCHARS((Nbed+3)>>2); 672 int numSnpsFailedQC = 0; 673 for (uint64 mfile = 0; mfile < Mfiles[i]; mfile++, mbed++) { 674 readBedLine(genoLine, bedLineIn, fin, bedSnpToGrmIndex[mbed] != -2); 675 if (bedSnpToGrmIndex[mbed] != -2) { // not excluded 676 double snpMissing = computeSnpMissing(genoLine, maskIndivs); 677 bool snpPassQC = snpMissing <= maxMissingPerSnp; 678 if (snpPassQC) { 679 if (bedSnpToGrmIndex[mbed] >= 0) { // use in GRM 680 storeBedLine(bedLineOut, genoLine); 681 bedLineOut += Nstride>>2; 682 bedSnpToGrmIndex[mbed] = snps.size(); // reassign to final value 683 snps.push_back(bedSnps[mbed]); 684 snps.back().MAF = computeMAF(genoLine, maskIndivs); 685 // update indiv QC info 686 for (uint64 n = 0; n < N; n++) 687 if (genoLine[n] == 9) 688 numMissingPerIndiv[n]++; 689 } 690 // if bedSnpToGrmIndex[mbed] == -1 (don't use in GRM), leave as-is 691 } 692 else { 693 bedSnpToGrmIndex[mbed] = -2; // exclude 694 if (numSnpsFailedQC < 5) 695 cout << "Filtering snp " << bedSnps[mbed].ID << ": " 696 << snpMissing << " missing" << endl; 697 numSnpsFailedQC++; 698 } 699 } 700 } 701 ALIGNED_FREE(bedLineIn); 702 ALIGNED_FREE(genoLine); 703 704 if (numSnpsFailedQC) 705 cout << "Filtered " << numSnpsFailedQC << " SNPs with > " << maxMissingPerSnp << " missing" 706 << endl; 707 708 if (!fin || fin.get() != EOF) { 709 cerr << "ERROR: Wrong file size or reading error for bed file: " 710 << bedFiles[i] << endl; 711 exit(1); 712 } 713 fin.close(); 714 } 715 716 M = snps.size(); 717 718 // allocate and initialize maskSnps to all good 719 maskSnps = ALIGNED_MALLOC_UCHARS(M); 720 memset(maskSnps, 1, M*sizeof(maskSnps[0])); 721 722 // QC indivs for missingness 723 int numIndivsFailedQC = 0; 724 for (uint64 n = 0; n < N; n++) 725 if (maskIndivs[n] && numMissingPerIndiv[n] > maxMissingPerIndiv * M) { 726 maskIndivs[n] = 0; 727 numIndivsQC--; 728 if (numIndivsFailedQC < 5) 729 cout << "Filtering indiv " << indivs[n].famID << " " << indivs[n].indivID << ": " 730 << numMissingPerIndiv[n] << "/" << M << " missing" << endl; 731 numIndivsFailedQC++; 732 } 733 if (numIndivsFailedQC) 734 cout << "Filtered " << numIndivsFailedQC << " indivs with > " << maxMissingPerIndiv 735 << " missing" << endl; 736 737 cout << "Total indivs after QC: " << numIndivsQC << endl; 738 cout << "Total post-QC SNPs: M = " << M << endl; 739 //std::map <int, int> vcSnpCounts; 740 vector <int> vcSnpCounts(getNumVCs()+1); 741 for (uint64 m = 0; m < M; m++) 742 if (snps[m].vcNum >= 1) 743 vcSnpCounts[snps[m].vcNum]++; 744 for (uint v = 1; v < vcSnpCounts.size(); v++) { 745 cout << " Variance component " << v << ": " << vcSnpCounts[v] << " post-QC SNPs (name: '" 746 << vcNames[v] << "')" << endl; 747 if (vcSnpCounts[v] == 0) { 748 cerr << " ERROR: No post-QC SNPs left in component \"" << vcNames[v] << "\"" << endl; 749 cerr << " Remove this component from --modelSnps to perform analysis" << endl; 750 cerr << " Also remove from --remlGuessStr if providing variance estimates" << endl; 751 exit(1); 752 } 753 } 754 } 755 ~SnpData()756 SnpData::~SnpData() { 757 if (genotypes!=NULL) ALIGNED_FREE(genotypes); 758 ALIGNED_FREE(maskSnps); 759 ALIGNED_FREE(maskIndivs); 760 } 761 freeGenotypes()762 void SnpData::freeGenotypes() { 763 ALIGNED_FREE(genotypes); 764 genotypes = NULL; 765 } 766 readBimFile(const string & bimFile,int Nauto)767 vector <SnpInfo> SnpData::readBimFile(const string &bimFile, int Nauto) { 768 vector <SnpInfo> ret; 769 string line; 770 FileUtils::AutoGzIfstream fin; fin.openOrExit(bimFile); 771 int numOutOfOrder = 0; 772 while (getline(fin, line)) { 773 std::istringstream iss(line); 774 SnpInfo snp; string chrom_str; 775 if (!(iss >> chrom_str >> snp.ID >> snp.genpos >> snp.physpos >> snp.allele1 >> snp.allele2)) 776 { 777 cerr << "ERROR: Incorrectly formatted bim file: " << bimFile << endl; 778 cerr << "Line " << ret.size()+1 << ":" << endl; 779 cerr << line << endl; 780 cerr << "Unable to input 6 values (2 string, 1 double, 1 int, 2 string)" << endl; 781 exit(1); 782 } 783 snp.chrom = chrStrToInt(chrom_str, Nauto); 784 if (snp.chrom == -1) { 785 cerr << "ERROR: Unknown chromosome code in bim file: " << bimFile << endl; 786 cerr << "Line " << ret.size()+1 << ":" << endl; 787 cerr << line << endl; 788 exit(1); 789 } 790 if (!ret.empty() && 791 (snp.chrom < ret.back().chrom || 792 (snp.chrom == ret.back().chrom && (snp.physpos <= ret.back().physpos || 793 snp.genpos < ret.back().genpos)))) { 794 if (numOutOfOrder < 5) { 795 cerr << "WARNING: Out-of-order snp in bim file: " << bimFile << endl; 796 cerr << "Line " << ret.size()+1 << ":" << endl; 797 cerr << line << endl; 798 } 799 numOutOfOrder++; 800 //exit(1); 801 } 802 ret.push_back(snp); 803 } 804 if (numOutOfOrder) 805 cerr << "WARNING: Total number of out-of-order snps in bim file: " << numOutOfOrder << endl; 806 fin.close(); 807 return ret; 808 } 809 getM(void) const810 uint64 SnpData::getM(void) const { return M; } 811 // don't provide getN: don't want the rest of the program to even know N! getNstride(void) const812 uint64 SnpData::getNstride(void) const { return Nstride; } getNbed(void) const813 uint64 SnpData::getNbed(void) const { return Nbed; } getNumIndivsQC(void) const814 uint64 SnpData::getNumIndivsQC(void) const { return numIndivsQC; } getMaskIndivs(void) const815 const double* SnpData::getMaskIndivs(void) const { return maskIndivs; } writeMaskIndivs(double out[]) const816 void SnpData::writeMaskIndivs(double out[]) const { 817 memcpy(out, maskIndivs, Nstride*sizeof(maskIndivs[0])); 818 } writeMaskSnps(uchar out[]) const819 void SnpData::writeMaskSnps(uchar out[]) const { memcpy(out, maskSnps, M*sizeof(maskSnps[0])); } getSnpInfo(void) const820 const vector <SnpInfo> &SnpData::getSnpInfo(void) const { return snps; } getBedSnpToGrmIndex(void) const821 const vector <int> &SnpData::getBedSnpToGrmIndex(void) const { return bedSnpToGrmIndex; } getFamPhenos(void) const822 vector <double> SnpData::getFamPhenos(void) const { 823 vector <double> phenos(N); 824 for (uint64 n = 0; n < N; n++) phenos[n] = indivs[n].pheno; 825 return phenos; 826 } getMapAvailable(void) const827 bool SnpData::getMapAvailable(void) const { return mapAvailable; } getGenotypes(void) const828 const uchar* SnpData::getGenotypes(void) const { return genotypes; } getNumVCs(void) const829 int SnpData::getNumVCs(void) const { return vcNames.size()-1; } getVCnames(void) const830 std::vector <string> SnpData::getVCnames(void) const { return vcNames; } 831 832 /* 833 vector <double> getMAFs(void) const { 834 vector <double> MAFs(M); 835 for (uint64 m = 0; m < M; m++) MAFs[m] = snps[m].MAF;; 836 return MAFs; 837 } 838 */ getIndivInd(string & FID,string & IID) const839 uint64 SnpData::getIndivInd(string &FID, string &IID) const { 840 std::map <string, uint64>::const_iterator it = FID_IID_to_ind.find(FID+" "+IID); 841 if (it != FID_IID_to_ind.end()) 842 return it->second; 843 else 844 return IND_MISSING; 845 } 846 847 // check same chrom and within EITHER gen or phys distance isProximal(uint64 m1,uint64 m2,double genWindow,int physWindow) const848 bool SnpData::isProximal(uint64 m1, uint64 m2, double genWindow, int physWindow) const { 849 if (snps[m1].chrom != snps[m2].chrom) return false; 850 return (mapAvailable && fabs(snps[m1].genpos - snps[m2].genpos) < genWindow) || 851 abs(snps[m1].physpos - snps[m2].physpos) < physWindow; 852 } 853 854 // note: pheno may have size Nstride, but we only output N values writeFam(const string & outFile,const vector<double> & pheno) const855 void SnpData::writeFam(const string &outFile, const vector <double> &pheno) const { 856 FileUtils::AutoGzOfstream fout; fout.openOrExit(outFile); 857 for (uint64 n = 0; n < N; n++) 858 fout << indivs[n].famID << "\t" << indivs[n].indivID << "\t" << indivs[n].paternalID << "\t" 859 << indivs[n].maternalID << "\t" << indivs[n].sex << "\t" 860 << std::setprecision(10) << pheno[n] << endl; 861 fout.close(); 862 } 863 864 /* 865 // creates map <string, double> and goes through snpInfo 866 // plain text, 2-col: rs, LDscore; # to ignore 867 void loadLDscore(const string &file) { 868 // clear existing LDscore (set to NaN/-1?) 869 } 870 871 // use LDscore regression to calibrate stats[] 872 void calibrateStats(const double stats[]) const { 873 874 } 875 */ 876 877 // writes Nstride values to out[] 878 // assumes subMaskIndivs is a sub-mask of maskIndivs 879 // (presumably obtained by using writeMaskIndivs and taking a subset) 880 // work: 256x4 aligned work array buildMaskedSnpVector(double out[],const double subMaskIndivs[],uint64 m,const double lut0129[4],double (* work)[4]) const881 void SnpData::buildMaskedSnpVector(double out[], const double subMaskIndivs[], uint64 m, 882 const double lut0129[4], double (*work)[4]) const { 883 884 /* lookupBedCode[4] = { value of 2 effect alleles (bed 00 = 2 effect alleles), 885 value of missing (bed 01 = missing), 886 value of 1 effect allele (bed 10 = 1 effect allele), 887 value of 0 effect alleles (bed 11 = 0 effect alleles) } */ 888 double lookupBedCode[4] = {lut0129[2], lut0129[3], lut0129[1], lut0129[0]}; 889 buildByteLookup(work, lookupBedCode); 890 891 uchar *ptr = genotypes + m * (Nstride>>2); 892 for (uint64 n4 = 0; n4 < Nstride; n4 += 4) { 893 #ifdef USE_SSE // todo: add AVX instructions to do all at once? 894 __m128d x01 = _mm_load_pd(&work[*ptr][0]); 895 __m128d x23 = _mm_load_pd(&work[*ptr][2]); 896 __m128d mask01 = _mm_load_pd(&subMaskIndivs[n4]); 897 __m128d mask23 = _mm_load_pd(&subMaskIndivs[n4+2]); 898 _mm_store_pd(&out[n4], _mm_mul_pd(x01, mask01)); 899 _mm_store_pd(&out[n4+2], _mm_mul_pd(x23, mask23)); 900 #else 901 // non-lookup approach 902 /* 903 uchar g = *ptr; 904 out[n4] = lookupBedCode[g&3] * subMaskIndivs[n4]; 905 out[n4+1] = lookupBedCode[(g>>2)&3] * subMaskIndivs[n4+1]; 906 out[n4+2] = lookupBedCode[(g>>4)&3] * subMaskIndivs[n4+2]; 907 out[n4+3] = lookupBedCode[(g>>6)&3] * subMaskIndivs[n4+3]; 908 */ 909 memcpy(out + n4, work[*ptr], sizeof(work[0])); 910 for (int k = 0; k < 4; k++) 911 out[n4+k] *= subMaskIndivs[n4+k]; 912 #endif 913 ptr++; 914 } 915 } 916 917 /** 918 * performs fast rough computation of LD Scores among chip snps to use in regression weights 919 * approximations: 920 * - only uses a subsample of individuals 921 * - replaces missing genotypes with means and uses them in dot product 922 * - computes adjusted r^2 using 1/n correction instead of baseline 923 */ estChipLDscores(uint64 sampleSize) const924 vector <double> SnpData::estChipLDscores(uint64 sampleSize) const { 925 if (sampleSize > numIndivsQC) { 926 cerr << "WARNING: Only " << numIndivsQC << " indivs available; using all" << endl; 927 sampleSize = numIndivsQC; 928 } 929 if (sampleSize & 7) { 930 sampleSize &= ~(uint64) 7; 931 cout << "Reducing sample size to " << sampleSize << " for memory alignment" << endl; 932 } 933 934 // choose sampleSize indivs from maskIndivs 935 uint64 step = numIndivsQC / sampleSize; 936 vector <int> indivInds; 937 for (uint64 n = 0, nGood = 0; n < Nstride; n++) 938 if (maskIndivs[n]) { 939 if (nGood % step == 0) { 940 indivInds.push_back(n); 941 if (indivInds.size() == sampleSize) 942 break; 943 } 944 nGood++; 945 } 946 947 vector <double> chipLDscores(M, NAN); 948 for (int chrom = 1; chrom <= Nautosomes+1; chrom++) 949 estChipLDscoresChrom(chipLDscores, chrom, indivInds); 950 return chipLDscores; 951 } 952 953 954 955 956 // note n <= N because of potential missing data/masking compute_r2(const int x[],const int y[],int dim) const957 double SnpData::compute_r2(const int x[], const int y[], int dim) const { 958 int n = 0; 959 double avg_x = 0, avg_y = 0; 960 for (int i = 0; i < dim; i++) 961 if (x[i] != 9 && y[i] != 9) { 962 avg_x += x[i]; 963 avg_y += y[i]; 964 n++; 965 } 966 avg_x /= n; avg_y /= n; 967 double sum_xx = 0, sum_xy = 0, sum_yy = 0; 968 for (int i = 0; i < dim; i++) 969 if (x[i] != 9 && y[i] != 9) { 970 double xi = x[i]-avg_x; 971 double yi = y[i]-avg_y; 972 sum_xx += xi*xi; 973 sum_xy += xi*yi; 974 sum_yy += yi*yi; 975 } 976 double r2 = sum_xy*sum_xy / (sum_xx*sum_yy); 977 return r2; 978 //return r2 - (1-r2)/(n-2); adjusted r2 979 } 980 981 // fill Nstride-element array with 0, 1, 2, 9; missing or mask => 9 fillSnpRow(int x[],uint64 m) const982 void SnpData::fillSnpRow(int x[], uint64 m) const { 983 /* lookupBedCode[4] = {value of 0, value of missing, value of 1, value of 2} */ 984 const int lookupBedCode[4] = {0, 9, 1, 2}; 985 for (uint64 n4 = 0; n4 < Nstride; n4 += 4) { 986 uchar byte = genotypes[m * (Nstride>>2) + (n4>>2)]; 987 for (int k = 0; k < 4; k++) 988 x[n4+k] = maskIndivs[n4+k] ? lookupBedCode[(byte>>(k+k))&3] : 9; 989 } 990 } 991 augmentLDscores(uint64 m,const vector<int> & mRow,uint64 mp,const vector<int> & mpRow,const vector<std::pair<double,int>> & windows,const vector<double> & alphaMAFdeps,vector<double> & LDscores,vector<double> & windowCounts) const992 bool SnpData::augmentLDscores(uint64 m, const vector <int> &mRow, uint64 mp, const vector <int> &mpRow, 993 const vector < std::pair <double, int> > &windows, 994 const vector <double> &alphaMAFdeps, vector <double> &LDscores, 995 vector <double> &windowCounts) const { 996 997 int W = windows.size(); 998 int A = alphaMAFdeps.size(); 999 double r2 = compute_r2(&mRow[0], &mpRow[0], Nstride); 1000 bool foundProximal = false; 1001 for (int w = 0; w < W; w++) 1002 if (isProximal(m, mp, windows[w].first, windows[w].second)) { 1003 foundProximal = true; 1004 if (!std::isnan(r2)) { 1005 for (int a = 0; a < A; a++) { 1006 double weight = pow((snps[mp].MAF * (1-snps[mp].MAF)), alphaMAFdeps[a]); 1007 LDscores[w*A+a] += weight * r2; 1008 windowCounts[w*A+a] += weight; 1009 } 1010 } 1011 } 1012 return foundProximal; 1013 } 1014 computeLDscore(uint64 m,const vector<std::pair<double,int>> & windows,vector<double> & windowCounts,double & baselineR2,const RestrictSnpSet & restrictPartnerSet,const vector<double> & alphaMAFdeps) const1015 vector <double> SnpData::computeLDscore(uint64 m, const vector < std::pair <double, int> > &windows, 1016 vector <double> &windowCounts, double &baselineR2, 1017 const RestrictSnpSet &restrictPartnerSet, 1018 const vector <double> &alphaMAFdeps) const { 1019 1020 int W = windows.size(); 1021 int A = alphaMAFdeps.size(); 1022 vector <double> LDscores(W*A); 1023 windowCounts = vector <double> (W*A); 1024 1025 vector <int> mRow(Nstride), mpRow(Nstride); 1026 fillSnpRow(&mRow[0], m); 1027 1028 int winSnps1 = 0, winSnps2 = 0; 1029 for (int mp = (int) m; mp >= 0; mp--) { 1030 if (snps[m].chrom != snps[mp].chrom) break; 1031 if (!restrictPartnerSet.isAllowed(snps[mp])) continue; 1032 fillSnpRow(&mpRow[0], mp); 1033 if (!augmentLDscores(m, mRow, mp, mpRow, windows, alphaMAFdeps, LDscores, windowCounts)) 1034 break; 1035 winSnps1++; 1036 } 1037 for (int mp = m+1; mp < (int) M; mp++) { 1038 if (snps[m].chrom != snps[mp].chrom) break; 1039 if (!restrictPartnerSet.isAllowed(snps[mp])) continue; 1040 fillSnpRow(&mpRow[0], mp); 1041 if (!augmentLDscores(m, mRow, mp, mpRow, windows, alphaMAFdeps, LDscores, windowCounts)) 1042 break; 1043 winSnps2++; 1044 } 1045 1046 // compute baseline average LD to random off-chrom SNPs 1047 //double maxWindowCount = *std::max_element(windowCounts.begin(), windowCounts.end()); 1048 //uint64 mStep = M / (int) maxWindowCount; // max will be for alpha=0 (actual integer count) 1049 uint64 mStep = M / (winSnps1 + winSnps2); 1050 double totOffChrom_r2s = 0; 1051 int numOffChrom_r2s = 0; 1052 for (uint64 mp = m % mStep; mp < M; mp += mStep) 1053 if (snps[mp].chrom != snps[m].chrom) { 1054 fillSnpRow(&mpRow[0], mp); 1055 double r2 = compute_r2(&mRow[0], &mpRow[0], Nstride); 1056 if (!std::isnan(r2)) { 1057 totOffChrom_r2s += r2; 1058 numOffChrom_r2s++; 1059 } 1060 } 1061 baselineR2 = totOffChrom_r2s / numOffChrom_r2s; 1062 for (int w = 0; w < W; w++) 1063 for (int a = 0; a < A; a++) 1064 LDscores[w*A+a] = (1+baselineR2) * LDscores[w*A+a] - windowCounts[w*A+a] * baselineR2; 1065 return LDscores; 1066 } 1067 1068 }; 1069