1 /*
2 Genome-wide Efficient Mixed Model Association (GEMMA)
3 Copyright © 2011-2017, Xiang Zhou
4 Copyright © 2017, Peter Carbonetto
5 Copyright © 2017, Pjotr Prins
6
7 This program is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>.
19 */
20
21 #include <iostream>
22 #include <iomanip>
23 #include <string>
24 #include <algorithm>
25 #include <cmath>
26 #include <cstring>
27 #include <fstream>
28 #include <sys/stat.h>
29
30 #include "gsl/gsl_blas.h"
31 #include "gsl/gsl_linalg.h"
32 #include "gsl/gsl_matrix.h"
33 #include "gsl/gsl_matrix.h"
34 #include "gsl/gsl_randist.h"
35 #include "gsl/gsl_vector.h"
36
37 #include "eigenlib.h"
38 #include "gemma.h"
39 #include "gemma_io.h"
40 #include "mathfunc.h"
41 #include "param.h"
42 #include "fastblas.h"
43
44 using namespace std;
45
46 // ---- Helper functions which do not use the PARAM scope
47
48 // Calculate the SNP sets as used in LOCO from mapchr which was filled
49 // from the annotation file. Returns ksnps (used for K) and gwasnps (used for
50 // GWA). Both are complimentary with each other and subsets of setSnps.
51
LOCO_set_Snps(set<string> & ksnps,set<string> & gwasnps,const map<string,string> mapchr,const string loco)52 void LOCO_set_Snps(set<string> &ksnps, set<string> &gwasnps,
53 const map<string, string> mapchr, const string loco) {
54 enforce_msg(ksnps.size() == 0, "make sure knsps is not initialized twice");
55 enforce_msg(gwasnps.size() == 0,
56 "make sure gwasnps is not initialized twice");
57 for (auto &kv : mapchr) {
58 auto snp = kv.first;
59 auto chr = kv.second;
60 if (chr != loco) {
61 ksnps.insert(snp);
62 } else {
63 gwasnps.insert(snp);
64 }
65 }
66 }
67
68 // Trim #individuals to size which is used to write tests that run faster
69 //
70 // Note it actually trims the number of functional individuals
71 // (indicator_idv[x] == 1). This should match indicator_cvt etc. If
72 // this gives problems with certain sets we can simply trim to size.
73
trim_individuals(vector<int> & idvs,size_t ni_max)74 void trim_individuals(vector<int> &idvs, size_t ni_max) {
75 if (ni_max) {
76 size_t count = 0;
77 for (auto ind = idvs.begin(); ind != idvs.end(); ++ind) {
78 if (*ind)
79 count++;
80 if (count >= ni_max)
81 break;
82 }
83 if (count != idvs.size()) {
84 if (is_debug_mode())
85 cout << "**** TEST MODE: trim individuals from " << idvs.size()
86 << " to " << count << endl;
87 idvs.resize(count);
88 }
89 }
90 }
91
92 // ---- PARAM class implementation
93
PARAM(void)94 PARAM::PARAM(void)
95 : a_mode(0), k_mode(1), d_pace(DEFAULT_PACE),
96 file_out("result"), path_out("./output/"), miss_level(0.05),
97 maf_level(0.01), hwe_level(0), r2_level(0.9999), l_min(1e-5), l_max(1e5),
98 n_region(10), p_nr(0.001), em_prec(0.0001), nr_prec(0.0001),
99 em_iter(10000), nr_iter(100), crt(0), pheno_mean(0), noconstrain(false),
100 h_min(-1), h_max(-1), h_scale(-1), rho_min(0.0), rho_max(1.0),
101 rho_scale(-1), logp_min(0.0), logp_max(0.0), logp_scale(-1), h_ngrid(10),
102 rho_ngrid(10), s_min(0), s_max(300), w_step(100000), s_step(1000000),
103 r_pace(10), w_pace(1000), n_accept(0), n_mh(10), geo_mean(2000.0),
104 randseed(-1), window_cm(0), window_bp(0), window_ns(0), n_block(200),
105 error(false), ni_subsample(0), n_cvt(1), n_cat(0), n_vc(1),
106 time_total(0.0), time_G(0.0), time_eigen(0.0), time_UtX(0.0),
107 time_UtZ(0.0), time_opt(0.0), time_Omega(0.0) {}
108
~PARAM()109 PARAM::~PARAM() {
110 gsl_rng_free(gsl_r);
111 }
112
113 // Read files: obtain ns_total, ng_total, ns_test, ni_test.
ReadFiles(void)114 void PARAM::ReadFiles(void) {
115 string file_str;
116
117 // Read cat file.
118 if (!file_mcat.empty()) {
119 if (ReadFile_mcat(file_mcat, mapRS2cat, n_vc) == false) {
120 error = true;
121 }
122 } else if (!file_cat.empty()) {
123 if (ReadFile_cat(file_cat, mapRS2cat, n_vc) == false) {
124 error = true;
125 }
126 }
127
128 // Read snp weight files.
129 if (!file_wcat.empty()) {
130 if (ReadFile_wsnp(file_wcat, n_vc, mapRS2wcat) == false) {
131 error = true;
132 }
133 }
134 if (!file_wsnp.empty()) {
135 if (ReadFile_wsnp(file_wsnp, mapRS2wsnp) == false) {
136 error = true;
137 }
138 }
139
140 // Count number of kinship files.
141 if (!file_mk.empty()) {
142 if (CountFileLines(file_mk, n_vc) == false) {
143 error = true;
144 }
145 }
146
147 // Read SNP set.
148 if (!file_snps.empty()) {
149 if (ReadFile_snps(file_snps, setSnps) == false) {
150 error = true;
151 }
152 } else {
153 setSnps.clear();
154 }
155
156 // Read KSNP set.
157 if (!file_ksnps.empty()) {
158 if (ReadFile_snps(file_ksnps, setKSnps) == false) {
159 error = true;
160 }
161 } else {
162 setKSnps.clear();
163 }
164
165 // For prediction.
166 if (!file_epm.empty()) {
167 if (ReadFile_est(file_epm, est_column, mapRS2est) == false) {
168 error = true;
169 }
170 if (!file_bfile.empty()) {
171 file_str = file_bfile + ".bim";
172 if (ReadFile_bim(file_str, snpInfo) == false) {
173 error = true;
174 }
175 file_str = file_bfile + ".fam";
176 if (ReadFile_fam(file_str, indicator_pheno, pheno, mapID2num, p_column) ==
177 false) {
178 error = true;
179 }
180 }
181
182 if (!file_geno.empty()) {
183 if (ReadFile_pheno(file_pheno, indicator_pheno, pheno, p_column) ==
184 false) {
185 error = true;
186 }
187
188 if (CountFileLines(file_geno, ns_total) == false) {
189 error = true;
190 }
191 }
192
193 if (!file_ebv.empty()) {
194 if (ReadFile_column(file_ebv, indicator_bv, vec_bv, 1) == false) {
195 error = true;
196 }
197 }
198
199 if (!file_log.empty()) {
200 if (ReadFile_log(file_log, pheno_mean) == false) {
201 error = true;
202 }
203 }
204
205 // Convert indicator_pheno to indicator_idv.
206 int k = 1;
207 for (size_t i = 0; i < indicator_pheno.size(); i++) {
208 k = 1;
209 for (size_t j = 0; j < indicator_pheno[i].size(); j++) {
210 if (indicator_pheno[i][j] == 0) {
211 k = 0;
212 }
213 }
214 indicator_idv.push_back(k);
215 }
216
217 ns_test = 0;
218
219 return;
220 }
221
222 // Read covariates before the genotype files.
223 if (!file_cvt.empty()) {
224 if (ReadFile_cvt(file_cvt, indicator_cvt, cvt, n_cvt) == false) {
225 error = true;
226 }
227 if ((indicator_cvt).size() == 0) {
228 n_cvt = 1;
229 }
230 } else {
231 n_cvt = 1;
232 }
233 trim_individuals(indicator_cvt, ni_max);
234
235 if (!file_gxe.empty()) {
236 if (ReadFile_column(file_gxe, indicator_gxe, gxe, 1) == false) {
237 error = true;
238 }
239 }
240 if (!file_weight.empty()) {
241 if (ReadFile_column(file_weight, indicator_weight, weight, 1) == false) {
242 error = true;
243 }
244 }
245
246 trim_individuals(indicator_idv, ni_max);
247
248 // Read genotype and phenotype file for PLINK format.
249 if (!file_bfile.empty()) {
250 file_str = file_bfile + ".bim";
251 snpInfo.clear();
252 if (ReadFile_bim(file_str, snpInfo) == false) {
253 error = true;
254 }
255
256 // If both fam file and pheno files are used, use
257 // phenotypes inside the pheno file.
258 if (!file_pheno.empty()) {
259
260 // Phenotype file before genotype file.
261 if (ReadFile_pheno(file_pheno, indicator_pheno, pheno, p_column) ==
262 false) {
263 error = true;
264 }
265 } else {
266 file_str = file_bfile + ".fam";
267 if (ReadFile_fam(file_str, indicator_pheno, pheno, mapID2num, p_column) ==
268 false) {
269 error = true;
270 }
271 }
272
273 // Post-process covariates and phenotypes, obtain
274 // ni_test, save all useful covariates.
275 ProcessCvtPhen();
276
277 // Obtain covariate matrix.
278 auto W1 = gsl_matrix_safe_alloc(ni_test, n_cvt);
279 CopyCvt(W1);
280
281 file_str = file_bfile + ".bed";
282 if (ReadFile_bed(file_str, setSnps, W1, indicator_idv, indicator_snp,
283 snpInfo, maf_level, miss_level, hwe_level, r2_level,
284 ns_test) == false) {
285 error = true;
286 }
287 gsl_matrix_free(W1);
288 ns_total = indicator_snp.size();
289 }
290
291 // Read genotype and phenotype file for BIMBAM format.
292 if (!file_geno.empty()) {
293
294 // Annotation file before genotype file.
295 if (!file_anno.empty()) {
296 if (ReadFile_anno(file_anno, mapRS2chr, mapRS2bp, mapRS2cM) == false) {
297 error = true;
298 }
299 }
300
301 // Phenotype file before genotype file.
302 if (ReadFile_pheno(file_pheno, indicator_pheno, pheno, p_column) == false) {
303 error = true;
304 }
305
306 // Post-process covariates and phenotypes, obtain
307 // ni_test, save all useful covariates.
308 ProcessCvtPhen();
309
310 // Obtain covariate matrix.
311 auto W2 = gsl_matrix_safe_alloc(ni_test, n_cvt);
312 CopyCvt(W2);
313
314 trim_individuals(indicator_idv, ni_max);
315 trim_individuals(indicator_cvt, ni_max);
316 if (ReadFile_geno(file_geno, setSnps, W2, indicator_idv, indicator_snp,
317 maf_level, miss_level, hwe_level, r2_level, mapRS2chr,
318 mapRS2bp, mapRS2cM, snpInfo, ns_test) == false) {
319 error = true;
320 return;
321 }
322 gsl_matrix_free(W2);
323
324 ns_total = indicator_snp.size();
325 }
326
327 // Read genotype file for multiple PLINK files.
328 if (!file_mbfile.empty()) {
329 igzstream infile(file_mbfile.c_str(), igzstream::in);
330 enforce_msg(infile,"fail to open mbfile file");
331 string file_name;
332 size_t t = 0, ns_test_tmp = 0;
333 gsl_matrix *W3 = NULL;
334 while (!safeGetline(infile, file_name).eof()) {
335 file_str = file_name + ".bim";
336
337 if (ReadFile_bim(file_str, snpInfo) == false) {
338 error = true;
339 }
340
341 if (t == 0) {
342
343 // If both fam file and pheno files are used, use
344 // phenotypes inside the pheno file.
345 if (!file_pheno.empty()) {
346
347 // Phenotype file before genotype file.
348 if (ReadFile_pheno(file_pheno, indicator_pheno, pheno, p_column) ==
349 false) {
350 error = true;
351 }
352 } else {
353 file_str = file_name + ".fam";
354 if (ReadFile_fam(file_str, indicator_pheno, pheno, mapID2num,
355 p_column) == false) {
356 error = true;
357 }
358 }
359
360 // Post-process covariates and phenotypes, obtain
361 // ni_test, save all useful covariates.
362 ProcessCvtPhen();
363
364 // Obtain covariate matrix.
365 W3 = gsl_matrix_safe_alloc(ni_test, n_cvt);
366 CopyCvt(W3);
367 }
368
369 file_str = file_name + ".bed";
370 if (ReadFile_bed(file_str, setSnps, W3, indicator_idv, indicator_snp,
371 snpInfo, maf_level, miss_level, hwe_level, r2_level,
372 ns_test_tmp) == false) {
373 error = true;
374 }
375 mindicator_snp.push_back(indicator_snp);
376 msnpInfo.push_back(snpInfo);
377 ns_test += ns_test_tmp;
378 ns_total += indicator_snp.size();
379
380 t++;
381 }
382
383 if (W3) gsl_matrix_free(W3);
384
385 infile.close();
386 infile.clear();
387 }
388
389 // Read genotype and phenotype file for multiple BIMBAM files.
390 if (!file_mgeno.empty()) {
391
392 // Annotation file before genotype file.
393 if (!file_anno.empty()) {
394 if (ReadFile_anno(file_anno, mapRS2chr, mapRS2bp, mapRS2cM) == false) {
395 error = true;
396 }
397 }
398
399 // Phenotype file before genotype file.
400 if (ReadFile_pheno(file_pheno, indicator_pheno, pheno, p_column) == false) {
401 error = true;
402 }
403
404 // Post-process covariates and phenotypes, obtain ni_test,
405 // save all useful covariates.
406 ProcessCvtPhen();
407
408 // Obtain covariate matrix.
409 gsl_matrix *W4 = gsl_matrix_safe_alloc(ni_test, n_cvt);
410 CopyCvt(W4);
411
412 igzstream infile(file_mgeno.c_str(), igzstream::in);
413 if (!infile) {
414 cout << "error! fail to open mgeno file: " << file_mgeno << endl;
415 error = true;
416 return;
417 }
418
419 string file_name;
420 size_t ns_test_tmp;
421 while (!safeGetline(infile, file_name).eof()) {
422 if (ReadFile_geno(file_name, setSnps, W4, indicator_idv, indicator_snp,
423 maf_level, miss_level, hwe_level, r2_level, mapRS2chr,
424 mapRS2bp, mapRS2cM, snpInfo, ns_test_tmp) == false) {
425 error = true;
426 }
427
428 mindicator_snp.push_back(indicator_snp);
429 msnpInfo.push_back(snpInfo);
430 ns_test += ns_test_tmp;
431 ns_total += indicator_snp.size();
432 }
433
434 gsl_matrix_free(W4);
435
436 infile.close();
437 infile.clear();
438 }
439
440 if (!file_gene.empty()) {
441 if (ReadFile_pheno(file_pheno, indicator_pheno, pheno, p_column) == false) {
442 error = true;
443 }
444
445 // Convert indicator_pheno to indicator_idv.
446 int k = 1;
447 for (size_t i = 0; i < indicator_pheno.size(); i++) {
448 k = 1;
449 for (size_t j = 0; j < indicator_pheno[i].size(); j++) {
450 if (indicator_pheno[i][j] == 0) {
451 k = 0;
452 }
453 }
454 indicator_idv.push_back(k);
455 }
456
457 // Post-process covariates and phenotypes, obtain
458 // ni_test, save all useful covariates.
459 ProcessCvtPhen();
460
461 // Obtain covariate matrix.
462 // gsl_matrix *W5 = gsl_matrix_alloc(ni_test, n_cvt);
463 // CopyCvt(W5);
464
465 if (ReadFile_gene(file_gene, vec_read, snpInfo, ng_total) == false) {
466 error = true;
467 }
468 }
469
470 // Read is after gene file.
471 if (!file_read.empty()) {
472 if (ReadFile_column(file_read, indicator_read, vec_read, 1) == false) {
473 error = true;
474 }
475
476 ni_test = 0;
477 for (vector<int>::size_type i = 0; i < (indicator_idv).size(); ++i) {
478 indicator_idv[i] *= indicator_read[i];
479 ni_test += indicator_idv[i];
480 }
481
482 enforce_msg(ni_test,"number of analyzed individuals equals 0.");
483 }
484
485 // For ridge prediction, read phenotype only.
486 if (file_geno.empty() && file_gene.empty() && !file_pheno.empty()) {
487 if (ReadFile_pheno(file_pheno, indicator_pheno, pheno, p_column) == false) {
488 error = true;
489 }
490
491 // Post-process covariates and phenotypes, obtain
492 // ni_test, save all useful covariates.
493 ProcessCvtPhen();
494 }
495
496 // Compute setKSnps when -loco is passed in
497 if (!loco.empty()) {
498 LOCO_set_Snps(setKSnps, setGWASnps, mapRS2chr, loco);
499 }
500 return;
501 }
502
CheckParam(void)503 void PARAM::CheckParam(void) {
504 struct stat fileInfo;
505 string str;
506
507 // Check parameters.
508 if (k_mode != 1 && k_mode != 2) {
509 cout << "error! unknown kinship/relatedness input mode: " << k_mode << endl;
510 error = true;
511 }
512 if (a_mode != 1 && a_mode != 2 && a_mode != 3 && a_mode != 4 && a_mode != 5 &&
513 a_mode != 11 && a_mode != 12 && a_mode != 13 && a_mode != 14 &&
514 a_mode != 15 && a_mode != 21 && a_mode != 22 && a_mode != 25 &&
515 a_mode != 26 && a_mode != 27 && a_mode != 28 && a_mode != 31 &&
516 a_mode != 41 && a_mode != 42 && a_mode != 43 && a_mode != 51 &&
517 a_mode != 52 && a_mode != 53 && a_mode != 54 && a_mode != 61 &&
518 a_mode != 62 && a_mode != 63 && a_mode != 66 && a_mode != 67 &&
519 a_mode != 71) {
520 cout << "error! unknown analysis mode: " << a_mode
521 << ". make sure -gk or -eigen or -lmm or -bslmm -predict or "
522 << "-calccov is specified correctly." << endl;
523 error = true;
524 }
525 if (miss_level > 1) {
526 cout << "error! missing level needs to be between 0 and 1. "
527 << "current value = " << miss_level << endl;
528 error = true;
529 }
530 if (maf_level > 0.5) {
531 cout << "error! maf level needs to be between 0 and 0.5. "
532 << "current value = " << maf_level << endl;
533 error = true;
534 }
535 if (hwe_level > 1) {
536 cout << "error! hwe level needs to be between 0 and 1. "
537 << "current value = " << hwe_level << endl;
538 error = true;
539 }
540 if (r2_level > 1) {
541 cout << "error! r2 level needs to be between 0 and 1. "
542 << "current value = " << r2_level << endl;
543 error = true;
544 }
545
546 if (l_max < l_min) {
547 cout << "error! maximum lambda value must be larger than the "
548 << "minimal value. current values = " << l_max << " and " << l_min
549 << endl;
550 error = true;
551 }
552 if (h_max < h_min) {
553 cout << "error! maximum h value must be larger than the minimal "
554 << "value. current values = " << h_max << " and " << h_min << endl;
555 error = true;
556 }
557 if (s_max < s_min) {
558 cout << "error! maximum s value must be larger than the minimal "
559 << "value. current values = " << s_max << " and " << s_min << endl;
560 error = true;
561 }
562 if (rho_max < rho_min) {
563 cout << "error! maximum rho value must be larger than the"
564 << "minimal value. current values = " << rho_max << " and " << rho_min
565 << endl;
566 error = true;
567 }
568 if (logp_max < logp_min) {
569 cout << "error! maximum logp value must be larger than the "
570 << "minimal value. current values = " << logp_max / log(10) << " and "
571 << logp_min / log(10) << endl;
572 error = true;
573 }
574
575 if (h_max > 1) {
576 cout << "error! h values must be bewtween 0 and 1. current "
577 << "values = " << h_max << " and " << h_min << endl;
578 error = true;
579 }
580 if (rho_max > 1) {
581 cout << "error! rho values must be between 0 and 1. current "
582 << "values = " << rho_max << " and " << rho_min << endl;
583 error = true;
584 }
585 if (logp_max > 0) {
586 cout << "error! maximum logp value must be smaller than 0. "
587 << "current values = " << logp_max / log(10) << " and "
588 << logp_min / log(10) << endl;
589 error = true;
590 }
591 if (l_max < l_min) {
592 cout << "error! maximum lambda value must be larger than the "
593 << "minimal value. current values = " << l_max << " and " << l_min
594 << endl;
595 error = true;
596 }
597
598 if (h_scale > 1.0) {
599 cout << "error! hscale value must be between 0 and 1. "
600 << "current value = " << h_scale << endl;
601 error = true;
602 }
603 if (rho_scale > 1.0) {
604 cout << "error! rscale value must be between 0 and 1. "
605 << "current value = " << rho_scale << endl;
606 error = true;
607 }
608 if (logp_scale > 1.0) {
609 cout << "error! pscale value must be between 0 and 1. "
610 << "current value = " << logp_scale << endl;
611 error = true;
612 }
613
614 if (rho_max == 1 && rho_min == 1 && a_mode == 12) {
615 cout << "error! ridge regression does not support a rho "
616 << "parameter. current values = " << rho_max << " and " << rho_min
617 << endl;
618 error = true;
619 }
620
621 if (window_cm < 0) {
622 cout << "error! windowcm values must be non-negative. "
623 << "current values = " << window_cm << endl;
624 error = true;
625 }
626
627 if (window_cm == 0 && window_bp == 0 && window_ns == 0) {
628 window_bp = 1000000;
629 }
630
631 // Check p_column, and (no need to) sort p_column into
632 // ascending order.
633 if (p_column.size() == 0) {
634 p_column.push_back(1);
635 } else {
636 for (size_t i = 0; i < p_column.size(); i++) {
637 for (size_t j = 0; j < i; j++) {
638 if (p_column[i] == p_column[j]) {
639 cout << "error! identical phenotype "
640 << "columns: " << p_column[i] << endl;
641 error = true;
642 }
643 }
644 }
645 }
646
647 n_ph = p_column.size();
648
649 // Only LMM option (and one prediction option) can deal with
650 // multiple phenotypes and no gene expression files.
651 if (n_ph > 1 && a_mode != 1 && a_mode != 2 && a_mode != 3 && a_mode != 4 &&
652 a_mode != 43) {
653 cout << "error! the current analysis mode " << a_mode
654 << " can not deal with multiple phenotypes." << endl;
655 error = true;
656 }
657 if (n_ph > 1 && !file_gene.empty()) {
658 cout << "error! multiple phenotype analysis option not "
659 << "allowed with gene expression files. " << endl;
660 error = true;
661 }
662
663 if (p_nr > 1) {
664 cout << "error! pnr value must be between 0 and 1. current value = " << p_nr
665 << endl;
666 error = true;
667 }
668
669 // check est_column
670 if (est_column.size() == 0) {
671 if (file_ebv.empty()) {
672 est_column.push_back(2);
673 est_column.push_back(5);
674 est_column.push_back(6);
675 est_column.push_back(7);
676 } else {
677 est_column.push_back(2);
678 est_column.push_back(0);
679 est_column.push_back(6);
680 est_column.push_back(7);
681 }
682 }
683
684 if (est_column.size() != 4) {
685 cout << "error! -en not followed by four numbers. current number = "
686 << est_column.size() << endl;
687 error = true;
688 }
689 if (est_column[0] == 0) {
690 cout << "error! -en rs column can not be zero. current number = "
691 << est_column.size() << endl;
692 error = true;
693 }
694
695 // Check if files are compatible with each other, and if files exist.
696 if (!file_bfile.empty()) {
697 str = file_bfile + ".bim";
698 if (stat(str.c_str(), &fileInfo) == -1) {
699 cout << "error! fail to open .bim file: " << str << endl;
700 error = true;
701 }
702 str = file_bfile + ".bed";
703 if (stat(str.c_str(), &fileInfo) == -1) {
704 cout << "error! fail to open .bed file: " << str << endl;
705 error = true;
706 }
707 str = file_bfile + ".fam";
708 if (stat(str.c_str(), &fileInfo) == -1) {
709 cout << "error! fail to open .fam file: " << str << endl;
710 error = true;
711 }
712 }
713
714 if ((!file_geno.empty() || !file_gene.empty())) {
715 str = file_pheno;
716 if (stat(str.c_str(), &fileInfo) == -1) {
717 cout << "error! fail to open phenotype file: " << str << endl;
718 error = true;
719 }
720 }
721
722 str = file_geno;
723 if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) {
724 cout << "error! fail to open mean genotype file: " << str << endl;
725 error = true;
726 }
727
728 str = file_gene;
729 if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) {
730 cout << "error! fail to open gene expression file: " << str << endl;
731 error = true;
732 }
733
734 str = file_cat;
735 if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) {
736 cout << "error! fail to open category file: " << str << endl;
737 error = true;
738 }
739
740 str = file_mcat;
741 if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) {
742 cout << "error! fail to open mcategory file: " << str << endl;
743 error = true;
744 }
745
746 str = file_beta;
747 if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) {
748 cout << "error! fail to open beta file: " << str << endl;
749 error = true;
750 }
751
752 str = file_cor;
753 if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) {
754 cout << "error! fail to open correlation file: " << str << endl;
755 error = true;
756 }
757
758 if (!file_study.empty()) {
759 str = file_study + ".Vq.txt";
760 if (stat(str.c_str(), &fileInfo) == -1) {
761 cout << "error! fail to open .Vq.txt file: " << str << endl;
762 error = true;
763 }
764 str = file_study + ".q.txt";
765 if (stat(str.c_str(), &fileInfo) == -1) {
766 cout << "error! fail to open .q.txt file: " << str << endl;
767 error = true;
768 }
769 str = file_study + ".size.txt";
770 if (stat(str.c_str(), &fileInfo) == -1) {
771 cout << "error! fail to open .size.txt file: " << str << endl;
772 error = true;
773 }
774 }
775
776 if (!file_ref.empty()) {
777 str = file_ref + ".S.txt";
778 if (stat(str.c_str(), &fileInfo) == -1) {
779 cout << "error! fail to open .S.txt file: " << str << endl;
780 error = true;
781 }
782 str = file_ref + ".size.txt";
783 if (stat(str.c_str(), &fileInfo) == -1) {
784 cout << "error! fail to open .size.txt file: " << str << endl;
785 error = true;
786 }
787 }
788
789 str = file_mstudy;
790 if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) {
791 cout << "error! fail to open mstudy file: " << str << endl;
792 error = true;
793 }
794
795 str = file_mref;
796 if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) {
797 cout << "error! fail to open mref file: " << str << endl;
798 error = true;
799 }
800
801 str = file_mgeno;
802 if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) {
803 cout << "error! fail to open mgeno file: " << str << endl;
804 error = true;
805 }
806
807 str = file_mbfile;
808 if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) {
809 cout << "error! fail to open mbfile file: " << str << endl;
810 error = true;
811 }
812
813 size_t flag = 0;
814 if (!file_bfile.empty()) {
815 flag++;
816 }
817 if (!file_geno.empty()) {
818 flag++;
819 }
820 if (!file_gene.empty()) {
821 flag++;
822 }
823
824 // Always set up random environment.
825 gsl_rng_env_setup(); // sets gsl_rng_default_seed
826 const gsl_rng_type *T = gsl_rng_default; // pick up environment GSL_RNG_SEED
827
828 if (randseed >= 0)
829 gsl_rng_default_seed = randseed; // CLI option used
830 else if (gsl_rng_default_seed == 0) { // by default we will randomize the seed
831 time_t rawtime;
832 time(&rawtime);
833 tm *ptm = gmtime(&rawtime);
834
835 gsl_rng_default_seed =
836 (unsigned)(ptm->tm_hour % 24 * 3600 + ptm->tm_min * 60 + ptm->tm_sec);
837 }
838 gsl_r = gsl_rng_alloc(T);
839
840 if (is_debug_mode()) {
841 printf ("GSL random generator type: %s; ", gsl_rng_name (gsl_r));
842 printf ("seed = %lu (option %li); ", gsl_rng_default_seed, randseed);
843 printf ("first value = %lu\n", gsl_rng_get (gsl_r));
844 }
845
846 if (flag != 1 && a_mode != 15 && a_mode != 27 && a_mode != 28 &&
847 a_mode != 43 && a_mode != 5 && a_mode != 61 && a_mode != 62 &&
848 a_mode != 63 && a_mode != 66 && a_mode != 67) {
849 cout << "error! either plink binary files, or bimbam mean"
850 << "genotype files, or gene expression files are required." << endl;
851 error = true;
852 }
853
854 if (file_pheno.empty() && (a_mode == 43 || a_mode == 5)) {
855 cout << "error! phenotype file is required." << endl;
856 error = true;
857 }
858
859 if (a_mode == 61 || a_mode == 62) {
860 if (!file_beta.empty()) {
861 if (file_mbfile.empty() && file_bfile.empty() && file_mgeno.empty() &&
862 file_geno.empty() && file_mref.empty() && file_ref.empty()) {
863 cout << "error! missing genotype file or ref/mref file." << endl;
864 error = true;
865 }
866 } else if (!file_pheno.empty()) {
867 if (file_kin.empty() && (file_ku.empty() || file_kd.empty()) &&
868 file_mk.empty()) {
869 cout << "error! missing relatedness file. " << endl;
870 error = true;
871 }
872 } else if ((file_mstudy.empty() && file_study.empty()) ||
873 (file_mref.empty() && file_ref.empty())) {
874 cout << "error! either beta file, or phenotype files or "
875 << "study/ref mstudy/mref files are required." << endl;
876 error = true;
877 }
878 }
879
880 if (a_mode == 63) {
881 if (file_kin.empty() && (file_ku.empty() || file_kd.empty()) &&
882 file_mk.empty()) {
883 cout << "error! missing relatedness file. " << endl;
884 error = true;
885 }
886 if (file_pheno.empty()) {
887 cout << "error! missing phenotype file." << endl;
888 error = true;
889 }
890 }
891
892 if (a_mode == 66 || a_mode == 67) {
893 if (file_beta.empty() || (file_mbfile.empty() && file_bfile.empty() &&
894 file_mgeno.empty() && file_geno.empty())) {
895 cout << "error! missing beta file or genotype file." << endl;
896 error = true;
897 }
898 }
899
900 if (!file_epm.empty() && file_bfile.empty() && file_geno.empty()) {
901 cout << "error! estimated parameter file also requires genotype "
902 << "file." << endl;
903 error = true;
904 }
905 if (!file_ebv.empty() && file_kin.empty()) {
906 cout << "error! estimated breeding value file also requires "
907 << "relatedness file." << endl;
908 error = true;
909 }
910
911 if (!file_log.empty() && pheno_mean != 0) {
912 cout << "error! either log file or mu value can be provide." << endl;
913 error = true;
914 }
915
916 enforce_fexists(file_snps, "open file");
917 enforce_fexists(file_ksnps, "open file");
918 enforce_fexists(file_gwasnps, "open file");
919 enforce_fexists(file_anno, "open file");
920
921 if (!loco.empty()) {
922 enforce_msg((a_mode >= 1 && a_mode <= 4) || a_mode == 21 || a_mode == 22,
923 "LOCO only works with LMM and K");
924 // enforce_msg(file_bfile.empty(), "LOCO does not work with PLink (yet)");
925 enforce_msg(file_gxe.empty(), "LOCO does not support GXE (yet)");
926 enforce_msg(!file_anno.empty(),
927 "LOCO requires annotation file (-a switch)");
928 enforce_msg(file_ksnps.empty(), "LOCO does not allow -ksnps switch");
929 enforce_msg(file_gwasnps.empty(), "LOCO does not allow -gwasnps switch");
930 }
931
932 enforce_fexists(file_kin, "open file");
933 enforce_fexists(file_mk, "open file");
934 enforce_fexists(file_cvt, "open file");
935 enforce_fexists(file_gxe, "open file");
936 enforce_fexists(file_log, "open file");
937 enforce_fexists(file_weight, "open file");
938 enforce_fexists(file_epm, "open file");
939 enforce_fexists(file_ebv, "open file");
940 enforce_fexists(file_read, "open file");
941
942 // Check if files are compatible with analysis mode.
943 if (k_mode == 2 && !file_geno.empty()) {
944 cout << "error! use \"-km 1\" when using bimbam mean genotype "
945 << "file. " << endl;
946 error = true;
947 }
948
949 if ((a_mode == 1 || a_mode == 2 || a_mode == 3 || a_mode == 4 ||
950 a_mode == 5 || a_mode == 31) &&
951 (file_kin.empty() && (file_ku.empty() || file_kd.empty()))) {
952 cout << "error! missing relatedness file. " << endl;
953 error = true;
954 }
955
956 if ((a_mode == 43) && file_kin.empty()) {
957 cout << "error! missing relatedness file. -predict option requires "
958 << "-k option to provide a relatedness file." << endl;
959 error = true;
960 }
961
962 if ((a_mode == 11 || a_mode == 12 || a_mode == 13 || a_mode == 14 ||
963 a_mode == 16) &&
964 !file_cvt.empty()) {
965 cout << "error! -bslmm option does not support covariates files." << endl;
966 error = true;
967 }
968
969 if (a_mode == 41 || a_mode == 42) {
970 if (!file_cvt.empty()) {
971 cout << "error! -predict option does not support "
972 << "covariates files." << endl;
973 error = true;
974 }
975 if (file_epm.empty()) {
976 cout << "error! -predict option requires estimated "
977 << "parameter files." << endl;
978 error = true;
979 }
980 }
981
982 if (file_beta.empty() && (a_mode == 27 || a_mode == 28)) {
983 cout << "error! beta effects file is required." << endl;
984 error = true;
985 }
986
987 return;
988 }
989
CheckData(void)990 void PARAM::CheckData(void) {
991
992 if ((a_mode == 66 || a_mode == 67) && (v_pve.size() != n_vc)) {
993 cout << "error! the number of pve estimates does not equal to "
994 << "the number of categories in the cat file:" << v_pve.size() << " "
995 << n_vc << endl;
996 error = true;
997 }
998
999 if ((indicator_cvt).size() != 0 &&
1000 (indicator_cvt).size() != (indicator_idv).size()) {
1001 error = true;
1002 cout << "error! number of rows in the covariates file do not "
1003 << "match the number of individuals. " << indicator_cvt.size() << endl;
1004 return;
1005 }
1006 if ((indicator_gxe).size() != 0 &&
1007 (indicator_gxe).size() != (indicator_idv).size()) {
1008 error = true;
1009 cout << "error! number of rows in the gxe file do not match the number "
1010 << "of individuals. " << endl;
1011 return;
1012 }
1013 if ((indicator_weight).size() != 0 &&
1014 (indicator_weight).size() != (indicator_idv).size()) {
1015 error = true;
1016 cout << "error! number of rows in the weight file do not match "
1017 << "the number of individuals. " << endl;
1018 return;
1019 }
1020
1021 if ((indicator_read).size() != 0 &&
1022 (indicator_read).size() != (indicator_idv).size()) {
1023 error = true;
1024 cout << "error! number of rows in the total read file do not "
1025 << "match the number of individuals. " << endl;
1026 return;
1027 }
1028
1029 // Calculate ni_total and ni_test, and set indicator_idv to 0
1030 // whenever indicator_cvt=0, and calculate np_obs and np_miss.
1031 ni_total = (indicator_idv).size();
1032
1033 ni_test = 0;
1034 for (vector<int>::size_type i = 0; i < (indicator_idv).size(); ++i) {
1035 if (indicator_idv[i] == 0) {
1036 continue;
1037 }
1038 ni_test++;
1039 }
1040
1041 ni_cvt = 0;
1042 for (size_t i = 0; i < indicator_cvt.size(); i++) {
1043 if (indicator_cvt[i] == 0) {
1044 continue;
1045 }
1046 ni_cvt++;
1047 }
1048
1049 np_obs = 0;
1050 np_miss = 0;
1051 for (size_t i = 0; i < indicator_pheno.size(); i++) {
1052 if (indicator_cvt.size() != 0) {
1053 if (indicator_cvt[i] == 0) {
1054 continue;
1055 }
1056 }
1057
1058 if (indicator_gxe.size() != 0) {
1059 if (indicator_gxe[i] == 0) {
1060 continue;
1061 }
1062 }
1063
1064 if (indicator_weight.size() != 0) {
1065 if (indicator_weight[i] == 0) {
1066 continue;
1067 }
1068 }
1069
1070 for (size_t j = 0; j < indicator_pheno[i].size(); j++) {
1071 if (indicator_pheno[i][j] == 0) {
1072 np_miss++;
1073 } else {
1074 np_obs++;
1075 }
1076 }
1077 }
1078
1079 enforce_msg(ni_test,"number of analyzed individuals equals 0.");
1080
1081 if (ni_test == 0 && file_cor.empty() && file_mstudy.empty() &&
1082 file_study.empty() && file_beta.empty() && file_bf.empty()) {
1083 error = true;
1084 cout << "error! number of analyzed individuals equals 0. " << endl;
1085 return;
1086 }
1087
1088 if (a_mode == 43) {
1089 if (ni_cvt == ni_test) {
1090 error = true;
1091 cout << "error! no individual has missing "
1092 << "phenotypes." << endl;
1093 return;
1094 }
1095 if ((np_obs + np_miss) != (ni_cvt * n_ph)) {
1096 error = true;
1097 cout << "error! number of phenotypes do not match the "
1098 << "summation of missing and observed phenotypes." << endl;
1099 return;
1100 }
1101 }
1102
1103 // Output some information.
1104 if (file_cor.empty() && file_mstudy.empty() && file_study.empty() &&
1105 a_mode != 15 && a_mode != 27 && a_mode != 28) {
1106 cout << "## number of total individuals = " << ni_total << endl;
1107 if (a_mode == 43) {
1108 cout << "## number of analyzed individuals = " << ni_cvt << endl;
1109 cout << "## number of individuals with full phenotypes = " << ni_test
1110 << endl;
1111 } else {
1112 cout << "## number of analyzed individuals = " << ni_test << endl;
1113 }
1114 cout << "## number of covariates = " << n_cvt << endl;
1115 cout << "## number of phenotypes = " << n_ph << endl;
1116 if (a_mode == 43) {
1117 cout << "## number of observed data = " << np_obs << endl;
1118 cout << "## number of missing data = " << np_miss << endl;
1119 }
1120 if (!file_gene.empty()) {
1121 cout << "## number of total genes = " << ng_total << endl;
1122 } else if (file_epm.empty() && a_mode != 43 && a_mode != 5) {
1123 if (!loco.empty())
1124 cout << "## leave one chromosome out (LOCO) = " << setw(8) << loco << endl;
1125 cout << "## number of total SNPs/var = " << setw(8) << ns_total << endl;
1126 if (setSnps.size())
1127 cout << "## number of considered SNPS = " << setw(8) << setSnps.size() << endl;
1128 if (setKSnps.size())
1129 cout << "## number of SNPS for K = " << setw(8) << setKSnps.size() << endl;
1130 if (setGWASnps.size())
1131 cout << "## number of SNPS for GWAS = " << setw(8) << setGWASnps.size() << endl;
1132 cout << "## number of analyzed SNPs = " << setw(8) << ns_test << endl;
1133 } else {
1134 }
1135 }
1136
1137 // Set d_pace to 1000 for gene expression.
1138 if (!file_gene.empty() && d_pace == DEFAULT_PACE) {
1139 d_pace = 1000;
1140 }
1141
1142 // For case-control studies, count # cases and # controls.
1143 int flag_cc = 0;
1144 if (a_mode == 13) {
1145 ni_case = 0;
1146 ni_control = 0;
1147 for (size_t i = 0; i < indicator_idv.size(); i++) {
1148 if (indicator_idv[i] == 0) {
1149 continue;
1150 }
1151
1152 if (pheno[i][0] == 0) {
1153 ni_control++;
1154 } else if (pheno[i][0] == 1) {
1155 ni_case++;
1156 } else {
1157 flag_cc = 1;
1158 }
1159 }
1160 cout << "## number of cases = " << ni_case << endl;
1161 cout << "## number of controls = " << ni_control << endl;
1162 }
1163
1164 if (flag_cc == 1) {
1165 cout << "Unexpected non-binary phenotypes for "
1166 << "case/control analysis. Use default (BSLMM) analysis instead."
1167 << endl;
1168 a_mode = 11;
1169 }
1170
1171 // Set parameters for BSLMM and check for predict.
1172 if (a_mode == 11 || a_mode == 12 || a_mode == 13 || a_mode == 14) {
1173 if (a_mode == 11) {
1174 n_mh = 1;
1175 }
1176 if (logp_min == 0) {
1177 logp_min = -1.0 * log((double)ns_test);
1178 }
1179
1180 if (h_scale == -1) {
1181 h_scale = min(1.0, 10.0 / sqrt((double)ni_test));
1182 }
1183 if (rho_scale == -1) {
1184 rho_scale = min(1.0, 10.0 / sqrt((double)ni_test));
1185 }
1186 if (logp_scale == -1) {
1187 logp_scale = min(1.0, 5.0 / sqrt((double)ni_test));
1188 }
1189
1190 if (h_min == -1) {
1191 h_min = 0.0;
1192 }
1193 if (h_max == -1) {
1194 h_max = 1.0;
1195 }
1196
1197 if (s_max > ns_test) {
1198 s_max = ns_test;
1199 cout << "s_max is re-set to the number of analyzed SNPs." << endl;
1200 }
1201 if (s_max < s_min) {
1202 cout << "error! maximum s value must be larger than the "
1203 << "minimal value. current values = " << s_max << " and " << s_min
1204 << endl;
1205 error = true;
1206 }
1207 } else if (a_mode == 41 || a_mode == 42) {
1208 if (indicator_bv.size() != 0) {
1209 if (indicator_idv.size() != indicator_bv.size()) {
1210 cout << "error! number of rows in the "
1211 << "phenotype file does not match that in the "
1212 << "estimated breeding value file: " << indicator_idv.size()
1213 << "\t" << indicator_bv.size() << endl;
1214 error = true;
1215 } else {
1216 size_t flag_bv = 0;
1217 for (size_t i = 0; i < (indicator_bv).size(); ++i) {
1218 if (indicator_idv[i] != indicator_bv[i]) {
1219 flag_bv++;
1220 }
1221 }
1222 if (flag_bv != 0) {
1223 cout << "error! individuals with missing value in the "
1224 << "phenotype file does not match that in the "
1225 << "estimated breeding value file: " << flag_bv << endl;
1226 error = true;
1227 }
1228 }
1229 }
1230 }
1231
1232 if (a_mode == 62 && !file_beta.empty() && mapRS2wcat.size() == 0) {
1233 cout << "vc analysis with beta files requires -wcat file." << endl;
1234 error = true;
1235 }
1236 if (a_mode == 67 && mapRS2wcat.size() == 0) {
1237 cout << "ci analysis with beta files requires -wcat file." << endl;
1238 error = true;
1239 }
1240
1241 // File_mk needs to contain more than one line.
1242 if (n_vc == 1 && !file_mk.empty()) {
1243 cout << "error! -mk file should contain more than one line." << endl;
1244 error = true;
1245 }
1246
1247 return;
1248 }
1249
PrintSummary()1250 void PARAM::PrintSummary() {
1251 if (n_ph == 1) {
1252 cout << "pve estimate =" << pve_null << endl;
1253 cout << "se(pve) =" << pve_se_null << endl;
1254 } else {
1255 }
1256 return;
1257 }
1258
ReadGenotypes(gsl_matrix * UtX,gsl_matrix * K,const bool calc_K)1259 void PARAM::ReadGenotypes(gsl_matrix *UtX, gsl_matrix *K, const bool calc_K) {
1260 string file_str;
1261
1262 if (!file_bfile.empty()) {
1263 file_str = file_bfile + ".bed";
1264 if (ReadFile_bed(file_str, indicator_idv, indicator_snp, UtX, K, calc_K) ==
1265 false) {
1266 error = true;
1267 }
1268 } else {
1269 if (ReadFile_geno(file_geno, indicator_idv, indicator_snp, UtX, K,
1270 calc_K) == false) {
1271 error = true;
1272 }
1273 }
1274
1275 return;
1276 }
1277
ReadGenotypes(vector<vector<unsigned char>> & Xt,gsl_matrix * K,const bool calc_K)1278 void PARAM::ReadGenotypes(vector<vector<unsigned char>> &Xt, gsl_matrix *K,
1279 const bool calc_K) {
1280 string file_str;
1281
1282 if (!file_bfile.empty()) {
1283 file_str = file_bfile + ".bed";
1284 if (ReadFile_bed(file_str, indicator_idv, indicator_snp, Xt, K, calc_K,
1285 ni_test, ns_test) == false) {
1286 error = true;
1287 }
1288 } else {
1289 if (ReadFile_geno(file_geno, indicator_idv, indicator_snp, Xt, K, calc_K,
1290 ni_test, ns_test) == false) {
1291 error = true;
1292 }
1293 }
1294
1295 return;
1296 }
1297
CalcKin(gsl_matrix * matrix_kin)1298 void PARAM::CalcKin(gsl_matrix *matrix_kin) {
1299 string file_str;
1300
1301 gsl_matrix_set_zero(matrix_kin);
1302
1303 if (!file_bfile.empty()) {
1304 file_str = file_bfile + ".bed";
1305 // enforce_msg(loco.empty(), "FIXME: LOCO nyi");
1306 if (PlinkKin(file_str, indicator_snp, a_mode - 20, d_pace, matrix_kin) ==
1307 false) {
1308 error = true;
1309 }
1310 } else {
1311 file_str = file_geno;
1312 if (BimbamKin(file_str, setKSnps, indicator_snp, a_mode - 20, d_pace,
1313 matrix_kin, ni_max == 0) == false) {
1314 error = true;
1315 }
1316 }
1317
1318 return;
1319 }
1320
1321 // From an existing n by nd A and K matrices, compute the d by d S
1322 // matrix (which is not necessary symmetric).
compAKtoS(const gsl_matrix * A,const gsl_matrix * K,const size_t n_cvt,gsl_matrix * S)1323 void compAKtoS(const gsl_matrix *A, const gsl_matrix *K, const size_t n_cvt,
1324 gsl_matrix *S) {
1325 size_t n_vc = S->size1, ni_test = A->size1;
1326 double di, dj, tr_AK, sum_A, sum_K, s_A, s_K, sum_AK, tr_A, tr_K, d;
1327
1328 for (size_t i = 0; i < n_vc; i++) {
1329 for (size_t j = 0; j < n_vc; j++) {
1330 tr_AK = 0;
1331 sum_A = 0;
1332 sum_K = 0;
1333 sum_AK = 0;
1334 tr_A = 0;
1335 tr_K = 0;
1336 for (size_t l = 0; l < ni_test; l++) {
1337 s_A = 0;
1338 s_K = 0;
1339 for (size_t k = 0; k < ni_test; k++) {
1340 di = gsl_matrix_get(A, l, k + ni_test * i);
1341 dj = gsl_matrix_get(K, l, k + ni_test * j);
1342 s_A += di;
1343 s_K += dj;
1344
1345 tr_AK += di * dj;
1346 sum_A += di;
1347 sum_K += dj;
1348 if (l == k) {
1349 tr_A += di;
1350 tr_K += dj;
1351 }
1352 }
1353 sum_AK += s_A * s_K;
1354 }
1355
1356 sum_A /= (double)ni_test;
1357 sum_K /= (double)ni_test;
1358 sum_AK /= (double)ni_test;
1359 tr_A -= sum_A;
1360 tr_K -= sum_K;
1361 d = tr_AK - 2 * sum_AK + sum_A * sum_K;
1362
1363 if (tr_A == 0 || tr_K == 0) {
1364 d = 0;
1365 } else {
1366 assert((tr_A * tr_K) - 1 != 0);
1367 assert(ni_test - n_cvt != 0);
1368 d = d / (tr_A * tr_K) - 1 / (double)(ni_test - n_cvt);
1369 }
1370
1371 gsl_matrix_set(S, i, j, d);
1372 }
1373 }
1374
1375 return;
1376 }
1377
1378 /*
1379 // Copied from lmm.cpp; is used in the following function compKtoV
1380 // map a number 1..(n_cvt+2) to an index between 0 and [(n_cvt+2)*2+(n_cvt+2)]/2-1
1381 // or 1..cols to 0..(cols*2+cols)/2-1.
1382
1383 For a 3x3 matrix the following index gets returned to CalcPab:
1384
1385 CalcPab
1386 * 1,1:0
1387 * 1,2:1
1388 * 1,3:2
1389 * 2,2:5
1390 * 2,3:6
1391 * 3,3:9
1392
1393 which is really the iteration moving forward along the diagonal and
1394 items to the right of it.
1395 */
1396
1397
GetabIndex(const size_t a,const size_t b,const size_t n_cvt)1398 size_t GetabIndex(const size_t a, const size_t b, const size_t n_cvt) {
1399 auto cols = n_cvt + 2;
1400 enforce_msg(a<=cols && b<=cols,"GetabIndex problem");
1401 size_t a1 = a, b1 = b;
1402 if (b <= a) {
1403 a1 = b;
1404 b1 = a;
1405 }
1406
1407 size_t index = (2 * cols - a1 + 2) * (a1 - 1) / 2 + b1 - a1;
1408 // cout << "* GetabIndx " << a1 << "," << b1 << "," << cols << ":" << index << endl;
1409 // FIXME: should add a contract for index range
1410 return index;
1411 // return ( b < a ? ((2 * cols - b + 2) * (b - 1) / 2 + a - b ): ((2 * cols - a + 2) * (a - 1) / 2 + b - a) );
1412
1413 }
1414
1415 // From an existing n by nd (centered) G matrix, compute the d+1 by
1416 // d*(d-1)/2*(d+1) Q matrix where inside i'th d+1 by d+1 matrix, each
1417 // element is tr(KiKlKjKm)-r*tr(KmKiKl)-r*tr(KlKjKm)+r^2*tr(KlKm),
1418 // where r=n/(n-1)
compKtoV(const gsl_matrix * G,gsl_matrix * V)1419 void compKtoV(const gsl_matrix *G, gsl_matrix *V) {
1420 size_t n_vc = G->size2 / G->size1, ni_test = G->size1;
1421
1422 gsl_matrix *KiKj =
1423 gsl_matrix_alloc(ni_test, (n_vc * (n_vc + 1)) / 2 * ni_test);
1424 gsl_vector *trKiKj = gsl_vector_alloc(n_vc * (n_vc + 1) / 2);
1425 gsl_vector *trKi = gsl_vector_alloc(n_vc);
1426
1427 assert(ni_test != 1);
1428 double d, tr, r = (double)ni_test / (double)(ni_test - 1);
1429 size_t t, t_il, t_jm, t_lm, t_im, t_jl, t_ij;
1430
1431 // Compute KiKj for all pairs of i and j (not including the identity
1432 // matrix).
1433 t = 0;
1434 for (size_t i = 0; i < n_vc; i++) {
1435 gsl_matrix_const_view Ki =
1436 gsl_matrix_const_submatrix(G, 0, i * ni_test, ni_test, ni_test);
1437 for (size_t j = i; j < n_vc; j++) {
1438 gsl_matrix_const_view Kj =
1439 gsl_matrix_const_submatrix(G, 0, j * ni_test, ni_test, ni_test);
1440 gsl_matrix_view KiKj_sub =
1441 gsl_matrix_submatrix(KiKj, 0, t * ni_test, ni_test, ni_test);
1442 fast_dgemm("N", "N", 1.0, &Ki.matrix, &Kj.matrix, 0.0,
1443 &KiKj_sub.matrix);
1444 t++;
1445 }
1446 }
1447
1448 // Compute trKi, trKiKj.
1449 t = 0;
1450 for (size_t i = 0; i < n_vc; i++) {
1451 for (size_t j = i; j < n_vc; j++) {
1452 tr = 0;
1453 for (size_t k = 0; k < ni_test; k++) {
1454 tr += gsl_matrix_get(KiKj, k, t * ni_test + k);
1455 }
1456 gsl_vector_set(trKiKj, t, tr);
1457
1458 t++;
1459 }
1460
1461 tr = 0;
1462 for (size_t k = 0; k < ni_test; k++) {
1463 tr += gsl_matrix_get(G, k, i * ni_test + k);
1464 }
1465 gsl_vector_set(trKi, i, tr);
1466 }
1467
1468 // Compute V.
1469 for (size_t i = 0; i < n_vc; i++) {
1470 for (size_t j = i; j < n_vc; j++) {
1471 t_ij = GetabIndex(i + 1, j + 1, n_vc - 2);
1472 for (size_t l = 0; l < n_vc + 1; l++) {
1473 for (size_t m = 0; m < n_vc + 1; m++) {
1474 if (l != n_vc && m != n_vc) {
1475 t_il = GetabIndex(i + 1, l + 1, n_vc - 2);
1476 t_jm = GetabIndex(j + 1, m + 1, n_vc - 2);
1477 t_lm = GetabIndex(l + 1, m + 1, n_vc - 2);
1478 tr = 0;
1479 for (size_t k = 0; k < ni_test; k++) {
1480 gsl_vector_const_view KiKl_row =
1481 gsl_matrix_const_subrow(KiKj, k, t_il * ni_test, ni_test);
1482 gsl_vector_const_view KiKl_col =
1483 gsl_matrix_const_column(KiKj, t_il * ni_test + k);
1484 gsl_vector_const_view KjKm_row =
1485 gsl_matrix_const_subrow(KiKj, k, t_jm * ni_test, ni_test);
1486 gsl_vector_const_view KjKm_col =
1487 gsl_matrix_const_column(KiKj, t_jm * ni_test + k);
1488
1489 gsl_vector_const_view Kl_row =
1490 gsl_matrix_const_subrow(G, k, l * ni_test, ni_test);
1491 gsl_vector_const_view Km_row =
1492 gsl_matrix_const_subrow(G, k, m * ni_test, ni_test);
1493
1494 if (i <= l && j <= m) {
1495 gsl_blas_ddot(&KiKl_row.vector, &KjKm_col.vector, &d);
1496 tr += d;
1497 gsl_blas_ddot(&Km_row.vector, &KiKl_col.vector, &d);
1498 tr -= r * d;
1499 gsl_blas_ddot(&Kl_row.vector, &KjKm_col.vector, &d);
1500 tr -= r * d;
1501 } else if (i <= l && j > m) {
1502 gsl_blas_ddot(&KiKl_row.vector, &KjKm_row.vector, &d);
1503 tr += d;
1504 gsl_blas_ddot(&Km_row.vector, &KiKl_col.vector, &d);
1505 tr -= r * d;
1506 gsl_blas_ddot(&Kl_row.vector, &KjKm_row.vector, &d);
1507 tr -= r * d;
1508 } else if (i > l && j <= m) {
1509 gsl_blas_ddot(&KiKl_col.vector, &KjKm_col.vector, &d);
1510 tr += d;
1511 gsl_blas_ddot(&Km_row.vector, &KiKl_row.vector, &d);
1512 tr -= r * d;
1513 gsl_blas_ddot(&Kl_row.vector, &KjKm_col.vector, &d);
1514 tr -= r * d;
1515 } else {
1516 gsl_blas_ddot(&KiKl_col.vector, &KjKm_row.vector, &d);
1517 tr += d;
1518 gsl_blas_ddot(&Km_row.vector, &KiKl_row.vector, &d);
1519 tr -= r * d;
1520 gsl_blas_ddot(&Kl_row.vector, &KjKm_row.vector, &d);
1521 tr -= r * d;
1522 }
1523 }
1524
1525 tr += r * r * gsl_vector_get(trKiKj, t_lm);
1526 } else if (l != n_vc && m == n_vc) {
1527 t_il = GetabIndex(i + 1, l + 1, n_vc - 2);
1528 t_jl = GetabIndex(j + 1, l + 1, n_vc - 2);
1529 tr = 0;
1530 for (size_t k = 0; k < ni_test; k++) {
1531 gsl_vector_const_view KiKl_row =
1532 gsl_matrix_const_subrow(KiKj, k, t_il * ni_test, ni_test);
1533 gsl_vector_const_view KiKl_col =
1534 gsl_matrix_const_column(KiKj, t_il * ni_test + k);
1535 gsl_vector_const_view Kj_row =
1536 gsl_matrix_const_subrow(G, k, j * ni_test, ni_test);
1537
1538 if (i <= l) {
1539 gsl_blas_ddot(&KiKl_row.vector, &Kj_row.vector, &d);
1540 tr += d;
1541 } else {
1542 gsl_blas_ddot(&KiKl_col.vector, &Kj_row.vector, &d);
1543 tr += d;
1544 }
1545 }
1546 tr += -r * gsl_vector_get(trKiKj, t_il) -
1547 r * gsl_vector_get(trKiKj, t_jl) +
1548 r * r * gsl_vector_get(trKi, l);
1549 } else if (l == n_vc && m != n_vc) {
1550 t_jm = GetabIndex(j + 1, m + 1, n_vc - 2);
1551 t_im = GetabIndex(i + 1, m + 1, n_vc - 2);
1552 tr = 0;
1553 for (size_t k = 0; k < ni_test; k++) {
1554 gsl_vector_const_view KjKm_row =
1555 gsl_matrix_const_subrow(KiKj, k, t_jm * ni_test, ni_test);
1556 gsl_vector_const_view KjKm_col =
1557 gsl_matrix_const_column(KiKj, t_jm * ni_test + k);
1558 gsl_vector_const_view Ki_row =
1559 gsl_matrix_const_subrow(G, k, i * ni_test, ni_test);
1560
1561 if (j <= m) {
1562 gsl_blas_ddot(&KjKm_row.vector, &Ki_row.vector, &d);
1563 tr += d;
1564 } else {
1565 gsl_blas_ddot(&KjKm_col.vector, &Ki_row.vector, &d);
1566 tr += d;
1567 }
1568 }
1569 tr += -r * gsl_vector_get(trKiKj, t_im) -
1570 r * gsl_vector_get(trKiKj, t_jm) +
1571 r * r * gsl_vector_get(trKi, m);
1572 } else {
1573 tr = gsl_vector_get(trKiKj, t_ij) - r * gsl_vector_get(trKi, i) -
1574 r * gsl_vector_get(trKi, j) + r * r * (double)(ni_test - 1);
1575 }
1576
1577 gsl_matrix_set(V, l, t_ij * (n_vc + 1) + m, tr);
1578 }
1579 }
1580 }
1581 }
1582
1583 assert(ni_test != 0);
1584 gsl_matrix_scale(V, 1.0 / pow((double)ni_test, 2));
1585
1586 gsl_matrix_free(KiKj);
1587 gsl_vector_free(trKiKj);
1588 gsl_vector_free(trKi);
1589
1590 return;
1591 }
1592
1593 // Perform Jacknife sampling for variance of S.
JackknifeAKtoS(const gsl_matrix * W,const gsl_matrix * A,const gsl_matrix * K,gsl_matrix * S,gsl_matrix * Svar)1594 void JackknifeAKtoS(const gsl_matrix *W, const gsl_matrix *A,
1595 const gsl_matrix *K, gsl_matrix *S, gsl_matrix *Svar) {
1596 size_t n_vc = Svar->size1, ni_test = A->size1, n_cvt = W->size2;
1597
1598 vector<vector<vector<double>>> trAK, sumAK;
1599 vector<vector<double>> sumA, sumK, trA, trK, sA, sK;
1600 vector<double> vec_tmp;
1601 double di, dj, d, m, v;
1602
1603 // Initialize and set all elements to zero.
1604 for (size_t i = 0; i < ni_test; i++) {
1605 vec_tmp.push_back(0);
1606 }
1607
1608 for (size_t i = 0; i < n_vc; i++) {
1609 sumA.push_back(vec_tmp);
1610 sumK.push_back(vec_tmp);
1611 trA.push_back(vec_tmp);
1612 trK.push_back(vec_tmp);
1613 sA.push_back(vec_tmp);
1614 sK.push_back(vec_tmp);
1615 }
1616
1617 for (size_t i = 0; i < n_vc; i++) {
1618 trAK.push_back(sumK);
1619 sumAK.push_back(sumK);
1620 }
1621
1622 // Run jackknife.
1623 for (size_t i = 0; i < n_vc; i++) {
1624 for (size_t l = 0; l < ni_test; l++) {
1625 for (size_t k = 0; k < ni_test; k++) {
1626 di = gsl_matrix_get(A, l, k + ni_test * i);
1627 dj = gsl_matrix_get(K, l, k + ni_test * i);
1628
1629 for (size_t t = 0; t < ni_test; t++) {
1630 if (t == l || t == k) {
1631 continue;
1632 }
1633 sumA[i][t] += di;
1634 sumK[i][t] += dj;
1635 if (l == k) {
1636 trA[i][t] += di;
1637 trK[i][t] += dj;
1638 }
1639 }
1640 sA[i][l] += di;
1641 sK[i][l] += dj;
1642 }
1643 }
1644
1645 for (size_t t = 0; t < ni_test; t++) {
1646 assert(ni_test != 1);
1647 sumA[i][t] /= (double)(ni_test - 1);
1648 sumK[i][t] /= (double)(ni_test - 1);
1649 }
1650 }
1651
1652 for (size_t i = 0; i < n_vc; i++) {
1653 for (size_t j = 0; j < n_vc; j++) {
1654 for (size_t l = 0; l < ni_test; l++) {
1655 for (size_t k = 0; k < ni_test; k++) {
1656 di = gsl_matrix_get(A, l, k + ni_test * i);
1657 dj = gsl_matrix_get(K, l, k + ni_test * j);
1658 d = di * dj;
1659
1660 for (size_t t = 0; t < ni_test; t++) {
1661 if (t == l || t == k) {
1662 continue;
1663 }
1664 trAK[i][j][t] += d;
1665 }
1666 }
1667
1668 for (size_t t = 0; t < ni_test; t++) {
1669 if (t == l) {
1670 continue;
1671 }
1672 di = gsl_matrix_get(A, l, t + ni_test * i);
1673 dj = gsl_matrix_get(K, l, t + ni_test * j);
1674
1675 sumAK[i][j][t] += (sA[i][l] - di) * (sK[j][l] - dj);
1676 }
1677 }
1678
1679 for (size_t t = 0; t < ni_test; t++) {
1680 assert(ni_test != 1);
1681 sumAK[i][j][t] /= (double)(ni_test - 1);
1682 }
1683
1684 m = 0;
1685 v = 0;
1686 for (size_t t = 0; t < ni_test; t++) {
1687 d = trAK[i][j][t] - 2 * sumAK[i][j][t] + sumA[i][t] * sumK[j][t];
1688 if ((trA[i][t] - sumA[i][t]) == 0 || (trK[j][t] - sumK[j][t]) == 0) {
1689 d = 0;
1690 } else {
1691 d /= (trA[i][t] - sumA[i][t]) * (trK[j][t] - sumK[j][t]);
1692 d -= 1 / (double)(ni_test - n_cvt - 1);
1693 }
1694 m += d;
1695 v += d * d;
1696 }
1697 m /= (double)ni_test;
1698 v /= (double)ni_test;
1699 v -= m * m;
1700 v *= (double)(ni_test - 1);
1701 gsl_matrix_set(Svar, i, j, v);
1702 if (n_cvt == 1) {
1703 d = gsl_matrix_get(S, i, j);
1704 d = (double)ni_test * d - (double)(ni_test - 1) * m;
1705 gsl_matrix_set(S, i, j, d);
1706 }
1707 }
1708 }
1709
1710 return;
1711 }
1712
1713 // Compute the d by d S matrix with its d by d variance matrix of
1714 // Svar, and the d+1 by d(d+1) matrix of Q for V(q).
CalcS(const map<string,double> & mapRS2wA,const map<string,double> & mapRS2wK,const gsl_matrix * W,gsl_matrix * A,gsl_matrix * K,gsl_matrix * S,gsl_matrix * Svar,gsl_vector * ns)1715 void PARAM::CalcS(const map<string, double> &mapRS2wA,
1716 const map<string, double> &mapRS2wK, const gsl_matrix *W,
1717 gsl_matrix *A, gsl_matrix *K, gsl_matrix *S, gsl_matrix *Svar,
1718 gsl_vector *ns) {
1719 string file_str;
1720
1721 gsl_matrix_set_zero(S);
1722 gsl_matrix_set_zero(Svar);
1723 gsl_vector_set_zero(ns);
1724
1725 // Compute the kinship matrix G for multiple categories; these
1726 // matrices are not centered, for convienence of Jacknife sampling.
1727 if (!file_bfile.empty()) {
1728 file_str = file_bfile + ".bed";
1729 if (mapRS2wA.size() == 0) {
1730 if (PlinkKin(file_str, d_pace, indicator_idv, indicator_snp, mapRS2wK,
1731 mapRS2cat, snpInfo, W, K, ns) == false) {
1732 error = true;
1733 }
1734 } else {
1735 if (PlinkKin(file_str, d_pace, indicator_idv, indicator_snp, mapRS2wA,
1736 mapRS2cat, snpInfo, W, A, ns) == false) {
1737 error = true;
1738 }
1739 }
1740 } else if (!file_geno.empty()) {
1741 file_str = file_geno;
1742 if (mapRS2wA.size() == 0) {
1743 if (BimbamKinUncentered(file_str, setKSnps, d_pace, indicator_idv,
1744 indicator_snp, mapRS2wK, mapRS2cat, snpInfo, W, K,
1745 ns) == false) {
1746 error = true;
1747 }
1748 } else {
1749 if (BimbamKinUncentered(file_str, setKSnps, d_pace, indicator_idv,
1750 indicator_snp, mapRS2wA, mapRS2cat, snpInfo, W, A,
1751 ns) == false) {
1752 error = true;
1753 }
1754 }
1755 } else if (!file_mbfile.empty()) {
1756 if (mapRS2wA.size() == 0) {
1757 if (MFILEKin(1, file_mbfile, setKSnps, d_pace, indicator_idv,
1758 mindicator_snp, mapRS2wK, mapRS2cat, msnpInfo, W, K,
1759 ns) == false) {
1760 error = true;
1761 }
1762 } else {
1763 if (MFILEKin(1, file_mbfile, setKSnps, d_pace, indicator_idv,
1764 mindicator_snp, mapRS2wA, mapRS2cat, msnpInfo, W, A,
1765 ns) == false) {
1766 error = true;
1767 }
1768 }
1769 } else if (!file_mgeno.empty()) {
1770 if (mapRS2wA.size() == 0) {
1771 if (MFILEKin(0, file_mgeno, setKSnps, d_pace, indicator_idv,
1772 mindicator_snp, mapRS2wK, mapRS2cat, msnpInfo, W, K,
1773 ns) == false) {
1774 error = true;
1775 }
1776 } else {
1777 if (MFILEKin(0, file_mgeno, setKSnps, d_pace, indicator_idv,
1778 mindicator_snp, mapRS2wA, mapRS2cat, msnpInfo, W, A,
1779 ns) == false) {
1780 error = true;
1781 }
1782 }
1783 }
1784
1785 if (mapRS2wA.size() == 0) {
1786 gsl_matrix_memcpy(A, K);
1787 }
1788
1789 // Center and scale every kinship matrix inside G.
1790 for (size_t i = 0; i < n_vc; i++) {
1791 gsl_matrix_view Ksub =
1792 gsl_matrix_submatrix(K, 0, i * ni_test, ni_test, ni_test);
1793 CenterMatrix(&Ksub.matrix);
1794 // Scale the matrix G such that the mean diagonal = 1.
1795 ScaleMatrix(&Ksub.matrix);
1796
1797 gsl_matrix_view Asub =
1798 gsl_matrix_submatrix(A, 0, i * ni_test, ni_test, ni_test);
1799 CenterMatrix(&Asub.matrix);
1800 ScaleMatrix(&Asub.matrix);
1801 }
1802
1803 // Cased on G, compute S.
1804 compAKtoS(A, K, W->size2, S);
1805
1806 // Compute Svar and update S with Jacknife.
1807 JackknifeAKtoS(W, A, K, S, Svar);
1808
1809 return;
1810 }
1811
WriteVector(const gsl_vector * q,const gsl_vector * s,const size_t n_total,const string suffix)1812 void PARAM::WriteVector(const gsl_vector *q, const gsl_vector *s,
1813 const size_t n_total, const string suffix) {
1814 string file_str;
1815 file_str = path_out + "/" + file_out;
1816 file_str += ".";
1817 file_str += suffix;
1818 file_str += ".txt";
1819
1820 ofstream outfile(file_str.c_str(), ofstream::out);
1821 if (!outfile) {
1822 cout << "error writing file: " << file_str.c_str() << endl;
1823 return;
1824 }
1825
1826 outfile.precision(10);
1827
1828 for (size_t i = 0; i < q->size; ++i) {
1829 outfile << gsl_vector_get(q, i) << endl;
1830 }
1831
1832 for (size_t i = 0; i < s->size; ++i) {
1833 outfile << gsl_vector_get(s, i) << endl;
1834 }
1835
1836 outfile << n_total << endl;
1837
1838 outfile.close();
1839 outfile.clear();
1840 return;
1841 }
1842
WriteVar(const string suffix)1843 void PARAM::WriteVar(const string suffix) {
1844 string file_str, rs;
1845 file_str = path_out + "/" + file_out;
1846 file_str += ".";
1847 file_str += suffix;
1848 file_str += ".txt.gz";
1849
1850 ogzstream outfile(file_str.c_str(), ogzstream::out);
1851 if (!outfile) {
1852 cout << "error writing file: " << file_str.c_str() << endl;
1853 return;
1854 }
1855
1856 outfile.precision(10);
1857
1858 if (mindicator_snp.size() != 0) {
1859 for (size_t t = 0; t < mindicator_snp.size(); t++) {
1860 indicator_snp = mindicator_snp[t];
1861 for (size_t i = 0; i < indicator_snp.size(); i++) {
1862 if (indicator_snp[i] == 0) {
1863 continue;
1864 }
1865 rs = snpInfo[i].rs_number;
1866 outfile << rs << endl;
1867 }
1868 }
1869 } else {
1870 for (size_t i = 0; i < indicator_snp.size(); i++) {
1871 if (indicator_snp[i] == 0) {
1872 continue;
1873 }
1874 rs = snpInfo[i].rs_number;
1875 outfile << rs << endl;
1876 }
1877 }
1878
1879 outfile.close();
1880 outfile.clear();
1881 return;
1882 }
1883
WriteMatrix(const gsl_matrix * matrix_U,const string suffix)1884 void PARAM::WriteMatrix(const gsl_matrix *matrix_U, const string suffix) {
1885 string file_str;
1886 file_str = path_out + "/" + file_out;
1887 file_str += ".";
1888 file_str += suffix;
1889 file_str += ".txt";
1890
1891 ofstream outfile(file_str.c_str(), ofstream::out);
1892 if (!outfile) {
1893 cout << "error writing file: " << file_str.c_str() << endl;
1894 return;
1895 }
1896
1897 outfile.precision(10);
1898
1899 for (size_t i = 0; i < matrix_U->size1; ++i) {
1900 for (size_t j = 0; j < matrix_U->size2; ++j) {
1901 outfile << tab(j) << gsl_matrix_get(matrix_U, i, j);
1902 }
1903 outfile << endl;
1904 }
1905
1906 outfile.close();
1907 outfile.clear();
1908 return;
1909 }
1910
WriteVector(const gsl_vector * vector_D,const string suffix)1911 void PARAM::WriteVector(const gsl_vector *vector_D, const string suffix) {
1912 string file_str;
1913 file_str = path_out + "/" + file_out;
1914 file_str += ".";
1915 file_str += suffix;
1916 file_str += ".txt";
1917
1918 ofstream outfile(file_str.c_str(), ofstream::out);
1919 if (!outfile) {
1920 cout << "error writing file: " << file_str.c_str() << endl;
1921 return;
1922 }
1923
1924 outfile.precision(10);
1925
1926 for (size_t i = 0; i < vector_D->size; ++i) {
1927 outfile << gsl_vector_get(vector_D, i) << endl;
1928 }
1929
1930 outfile.close();
1931 outfile.clear();
1932 return;
1933 }
1934
CheckCvt()1935 void PARAM::CheckCvt() {
1936 if (indicator_cvt.size() == 0) {
1937 return;
1938 }
1939
1940 size_t ci_test = 0;
1941
1942 gsl_matrix *W = gsl_matrix_alloc(ni_test, n_cvt);
1943
1944 for (vector<int>::size_type i = 0; i < indicator_idv.size(); ++i) {
1945 if (indicator_idv[i] == 0 || indicator_cvt[i] == 0) {
1946 continue;
1947 }
1948 for (size_t j = 0; j < n_cvt; ++j) {
1949 gsl_matrix_set(W, ci_test, j, (cvt)[i][j]);
1950 }
1951 ci_test++;
1952 }
1953
1954 size_t flag_ipt = 0;
1955 double v_min, v_max;
1956 set<size_t> set_remove;
1957
1958 // Check if any columns is an intercept.
1959 for (size_t i = 0; i < W->size2; i++) {
1960 gsl_vector_view w_col = gsl_matrix_column(W, i);
1961 gsl_vector_minmax(&w_col.vector, &v_min, &v_max);
1962 if (v_min == v_max) {
1963 flag_ipt = 1;
1964 set_remove.insert(i);
1965 }
1966 }
1967
1968 // Add an intercept term if needed.
1969 if (n_cvt == set_remove.size()) {
1970 indicator_cvt.clear();
1971 n_cvt = 1;
1972 } else if (flag_ipt == 0) {
1973 info_msg("no intercept term is found in the cvt file: a column of 1s is added");
1974 for (vector<int>::size_type i = 0; i < indicator_idv.size(); ++i) {
1975 if (indicator_idv[i] == 0 || indicator_cvt[i] == 0) {
1976 continue;
1977 }
1978 cvt[i].push_back(1.0);
1979 }
1980
1981 n_cvt++;
1982 } else {
1983 }
1984
1985 gsl_matrix_free(W);
1986
1987 return;
1988 }
1989
1990 // Post-process phenotypes and covariates.
ProcessCvtPhen()1991 void PARAM::ProcessCvtPhen() {
1992
1993 // Convert indicator_pheno to indicator_idv.
1994 int k = 1;
1995 indicator_idv.clear();
1996 for (size_t i = 0; i < indicator_pheno.size(); i++) {
1997 k = 1;
1998 for (size_t j = 0; j < indicator_pheno[i].size(); j++) {
1999 if (indicator_pheno[i][j] == 0) {
2000 k = 0;
2001 }
2002 }
2003 indicator_idv.push_back(k);
2004 }
2005
2006 // Remove individuals with missing covariates.
2007 if ((indicator_cvt).size() != 0) {
2008 for (vector<int>::size_type i = 0; i < (indicator_idv).size(); ++i) {
2009 indicator_idv[i] *= indicator_cvt[i];
2010 }
2011 }
2012
2013 // Remove individuals with missing gxe variables.
2014 if ((indicator_gxe).size() != 0) {
2015 for (vector<int>::size_type i = 0; i < (indicator_idv).size(); ++i) {
2016 indicator_idv[i] *= indicator_gxe[i];
2017 }
2018 }
2019
2020 // Remove individuals with missing residual weights.
2021 if ((indicator_weight).size() != 0) {
2022 for (vector<int>::size_type i = 0; i < (indicator_idv).size(); ++i) {
2023 indicator_idv[i] *= indicator_weight[i];
2024 }
2025 }
2026
2027 // Obtain ni_test.
2028 ni_test = 0;
2029 for (vector<int>::size_type i = 0; i < (indicator_idv).size(); ++i) {
2030 if (indicator_idv[i] == 0) {
2031 continue;
2032 }
2033 ni_test++;
2034 }
2035
2036 // If subsample number is set, perform a random sub-sampling
2037 // to determine the subsampled ids.
2038 if (ni_subsample != 0) {
2039 if (ni_test < ni_subsample) {
2040 cout << "error! number of subsamples is less than number of"
2041 << "analyzed individuals. " << endl;
2042 } else {
2043
2044 // From ni_test, sub-sample ni_subsample.
2045 vector<size_t> a, b;
2046 for (size_t i = 0; i < ni_subsample; i++) {
2047 a.push_back(0);
2048 }
2049 for (size_t i = 0; i < ni_test; i++) {
2050 b.push_back(i);
2051 }
2052
2053 gsl_ran_choose(gsl_r, static_cast<void *>(&a[0]), ni_subsample,
2054 static_cast<void *>(&b[0]), ni_test, sizeof(size_t));
2055
2056 // Re-set indicator_idv and ni_test.
2057 int j = 0;
2058 for (vector<int>::size_type i = 0; i < (indicator_idv).size(); ++i) {
2059 if (indicator_idv[i] == 0) {
2060 continue;
2061 }
2062 if (find(a.begin(), a.end(), j) == a.end()) {
2063 indicator_idv[i] = 0;
2064 }
2065 j++;
2066 }
2067 ni_test = ni_subsample;
2068 }
2069 }
2070
2071 // Check ni_test.
2072 if (a_mode != M_BSLMM5)
2073 enforce_msg(ni_test,"number of analyzed individuals equals 0.");
2074 if (ni_test == 0 && a_mode != M_BSLMM5) {
2075 error = true;
2076 cout << "error! number of analyzed individuals equals 0. " << endl;
2077 }
2078
2079 // Check covariates to see if they are correlated with each
2080 // other, and to see if the intercept term is included.
2081 // After getting ni_test.
2082 // Add or remove covariates.
2083 if (indicator_cvt.size() != 0) {
2084 CheckCvt();
2085 } else {
2086 vector<double> cvt_row;
2087 cvt_row.push_back(1);
2088
2089 for (vector<int>::size_type i = 0; i < (indicator_idv).size(); ++i) {
2090 indicator_cvt.push_back(1);
2091 cvt.push_back(cvt_row);
2092 }
2093 }
2094
2095 return;
2096 }
2097
CopyCvt(gsl_matrix * W)2098 void PARAM::CopyCvt(gsl_matrix *W) {
2099 size_t ci_test = 0;
2100
2101 for (vector<int>::size_type i = 0; i < indicator_idv.size(); ++i) {
2102 if (indicator_idv[i] == 0 || indicator_cvt[i] == 0) {
2103 continue;
2104 }
2105 for (size_t j = 0; j < n_cvt; ++j) {
2106 gsl_matrix_set(W, ci_test, j, (cvt)[i][j]);
2107 }
2108 ci_test++;
2109 }
2110
2111 return;
2112 }
2113
CopyGxe(gsl_vector * env)2114 void PARAM::CopyGxe(gsl_vector *env) {
2115 size_t ci_test = 0;
2116
2117 for (vector<int>::size_type i = 0; i < indicator_idv.size(); ++i) {
2118 if (indicator_idv[i] == 0 || indicator_gxe[i] == 0) {
2119 continue;
2120 }
2121 gsl_vector_set(env, ci_test, gxe[i]);
2122 ci_test++;
2123 }
2124
2125 return;
2126 }
2127
CopyWeight(gsl_vector * w)2128 void PARAM::CopyWeight(gsl_vector *w) {
2129 size_t ci_test = 0;
2130
2131 for (vector<int>::size_type i = 0; i < indicator_idv.size(); ++i) {
2132 if (indicator_idv[i] == 0 || indicator_weight[i] == 0) {
2133 continue;
2134 }
2135 gsl_vector_set(w, ci_test, weight[i]);
2136 ci_test++;
2137 }
2138
2139 return;
2140 }
2141
2142 // If flag=0, then use indicator_idv to load W and Y;
2143 // else, use indicator_cvt to load them.
CopyCvtPhen(gsl_matrix * W,gsl_vector * y,size_t flag)2144 void PARAM::CopyCvtPhen(gsl_matrix *W, gsl_vector *y, size_t flag) {
2145 size_t ci_test = 0;
2146
2147 for (vector<int>::size_type i = 0; i < indicator_idv.size(); ++i) {
2148 if (flag == 0) {
2149 if (indicator_idv[i] == 0) {
2150 continue;
2151 }
2152 } else {
2153 if (indicator_cvt[i] == 0) {
2154 continue;
2155 }
2156 }
2157
2158 gsl_vector_set(y, ci_test, (pheno)[i][0]);
2159
2160 for (size_t j = 0; j < n_cvt; ++j) {
2161 gsl_matrix_set(W, ci_test, j, (cvt)[i][j]);
2162 }
2163 ci_test++;
2164 }
2165
2166 return;
2167 }
2168
2169 // If flag=0, then use indicator_idv to load W and Y;
2170 // else, use indicator_cvt to load them.
CopyCvtPhen(gsl_matrix * W,gsl_matrix * Y,size_t flag)2171 void PARAM::CopyCvtPhen(gsl_matrix *W, gsl_matrix *Y, size_t flag) {
2172 size_t ci_test = 0;
2173
2174 for (vector<int>::size_type i = 0; i < indicator_idv.size(); ++i) {
2175 if (flag == 0) {
2176 if (indicator_idv[i] == 0) {
2177 continue;
2178 }
2179 } else {
2180 if (indicator_cvt[i] == 0) {
2181 continue;
2182 }
2183 }
2184
2185 for (size_t j = 0; j < n_ph; ++j) {
2186 gsl_matrix_set(Y, ci_test, j, (pheno)[i][j]);
2187 }
2188 for (size_t j = 0; j < n_cvt; ++j) {
2189 gsl_matrix_set(W, ci_test, j, (cvt)[i][j]);
2190 }
2191
2192 ci_test++;
2193 }
2194
2195 return;
2196 }
2197
CopyRead(gsl_vector * log_N)2198 void PARAM::CopyRead(gsl_vector *log_N) {
2199 size_t ci_test = 0;
2200
2201 for (vector<int>::size_type i = 0; i < indicator_idv.size(); ++i) {
2202 if (indicator_idv[i] == 0) {
2203 continue;
2204 }
2205 gsl_vector_set(log_N, ci_test, log(vec_read[i]));
2206 ci_test++;
2207 }
2208
2209 return;
2210 }
2211
ObtainWeight(const set<string> & setSnps_beta,map<string,double> & mapRS2wK)2212 void PARAM::ObtainWeight(const set<string> &setSnps_beta,
2213 map<string, double> &mapRS2wK) {
2214 mapRS2wK.clear();
2215
2216 vector<double> wsum, wcount;
2217
2218 for (size_t i = 0; i < n_vc; i++) {
2219 wsum.push_back(0.0);
2220 wcount.push_back(0.0);
2221 }
2222
2223 string rs;
2224 if (msnpInfo.size() == 0) {
2225 for (size_t i = 0; i < snpInfo.size(); i++) {
2226 if (indicator_snp[i] == 0) {
2227 continue;
2228 }
2229
2230 rs = snpInfo[i].rs_number;
2231 if ((setSnps_beta.size() == 0 || setSnps_beta.count(rs) != 0) &&
2232 (mapRS2wsnp.size() == 0 || mapRS2wsnp.count(rs) != 0) &&
2233 (mapRS2wcat.size() == 0 || mapRS2wcat.count(rs) != 0) &&
2234 (mapRS2cat.size() == 0 || mapRS2cat.count(rs) != 0)) {
2235 if (mapRS2wsnp.size() != 0) {
2236 mapRS2wK[rs] = mapRS2wsnp[rs];
2237 if (mapRS2cat.size() == 0) {
2238 wsum[0] += mapRS2wsnp[rs];
2239 } else {
2240 wsum[mapRS2cat[rs]] += mapRS2wsnp[rs];
2241 }
2242 wcount[0]++;
2243 } else {
2244 mapRS2wK[rs] = 1;
2245 }
2246 }
2247 }
2248 } else {
2249 for (size_t t = 0; t < msnpInfo.size(); t++) {
2250 snpInfo = msnpInfo[t];
2251 indicator_snp = mindicator_snp[t];
2252
2253 for (size_t i = 0; i < snpInfo.size(); i++) {
2254 if (indicator_snp[i] == 0) {
2255 continue;
2256 }
2257
2258 rs = snpInfo[i].rs_number;
2259 if ((setSnps_beta.size() == 0 || setSnps_beta.count(rs) != 0) &&
2260 (mapRS2wsnp.size() == 0 || mapRS2wsnp.count(rs) != 0) &&
2261 (mapRS2wcat.size() == 0 || mapRS2wcat.count(rs) != 0) &&
2262 (mapRS2cat.size() == 0 || mapRS2cat.count(rs) != 0)) {
2263 if (mapRS2wsnp.size() != 0) {
2264 mapRS2wK[rs] = mapRS2wsnp[rs];
2265 if (mapRS2cat.size() == 0) {
2266 wsum[0] += mapRS2wsnp[rs];
2267 } else {
2268 wsum[mapRS2cat[rs]] += mapRS2wsnp[rs];
2269 }
2270 wcount[0]++;
2271 } else {
2272 mapRS2wK[rs] = 1;
2273 }
2274 }
2275 }
2276 }
2277 }
2278
2279 if (mapRS2wsnp.size() != 0) {
2280 for (size_t i = 0; i < n_vc; i++) {
2281 wsum[i] /= wcount[i];
2282 }
2283
2284 for (map<string, double>::iterator it = mapRS2wK.begin();
2285 it != mapRS2wK.end(); ++it) {
2286 if (mapRS2cat.size() == 0) {
2287 it->second /= wsum[0];
2288 } else {
2289 it->second /= wsum[mapRS2cat[it->first]];
2290 }
2291 }
2292 }
2293 return;
2294 }
2295
2296 // If pve_flag=0 then do not change pve; pve_flag==1, then change pve
2297 // to 0 if pve < 0 and pve to 1 if pve > 1.
UpdateWeight(const size_t pve_flag,const map<string,double> & mapRS2wK,const size_t ni_test,const gsl_vector * ns,map<string,double> & mapRS2wA)2298 void PARAM::UpdateWeight(const size_t pve_flag,
2299 const map<string, double> &mapRS2wK,
2300 const size_t ni_test, const gsl_vector *ns,
2301 map<string, double> &mapRS2wA) {
2302 double d;
2303 vector<double> wsum, wcount;
2304
2305 for (size_t i = 0; i < n_vc; i++) {
2306 wsum.push_back(0.0);
2307 wcount.push_back(0.0);
2308 }
2309
2310 for (map<string, double>::const_iterator it = mapRS2wK.begin();
2311 it != mapRS2wK.end(); ++it) {
2312 d = 1;
2313 for (size_t i = 0; i < n_vc; i++) {
2314 if (v_pve[i] >= 1 && pve_flag == 1) {
2315 d += (double)ni_test / gsl_vector_get(ns, i) * mapRS2wcat[it->first][i];
2316 } else if (v_pve[i] <= 0 && pve_flag == 1) {
2317 d += 0;
2318 } else {
2319 d += (double)ni_test / gsl_vector_get(ns, i) *
2320 mapRS2wcat[it->first][i] * v_pve[i];
2321 }
2322 }
2323 mapRS2wA[it->first] = 1 / (d * d);
2324
2325 if (mapRS2cat.size() == 0) {
2326 wsum[0] += mapRS2wA[it->first];
2327 wcount[0]++;
2328 } else {
2329 wsum[mapRS2cat[it->first]] += mapRS2wA[it->first];
2330 wcount[mapRS2cat[it->first]]++;
2331 }
2332 }
2333
2334 for (size_t i = 0; i < n_vc; i++) {
2335 wsum[i] /= wcount[i];
2336 }
2337
2338 for (map<string, double>::iterator it = mapRS2wA.begin();
2339 it != mapRS2wA.end(); ++it) {
2340 if (mapRS2cat.size() == 0) {
2341 it->second /= wsum[0];
2342 } else {
2343 it->second /= wsum[mapRS2cat[it->first]];
2344 }
2345 }
2346 return;
2347 }
2348
2349 // This function updates indicator_snp, and save z-scores and other
2350 // values into vectors.
UpdateSNPnZ(const map<string,double> & mapRS2wA,const map<string,string> & mapRS2A1,const map<string,double> & mapRS2z,gsl_vector * w,gsl_vector * z,vector<size_t> & vec_cat)2351 void PARAM::UpdateSNPnZ(const map<string, double> &mapRS2wA,
2352 const map<string, string> &mapRS2A1,
2353 const map<string, double> &mapRS2z, gsl_vector *w,
2354 gsl_vector *z, vector<size_t> &vec_cat) {
2355 gsl_vector_set_zero(w);
2356 gsl_vector_set_zero(z);
2357 vec_cat.clear();
2358
2359 string rs, a1;
2360 size_t c = 0;
2361 if (msnpInfo.size() == 0) {
2362 for (size_t i = 0; i < snpInfo.size(); i++) {
2363 if (indicator_snp[i] == 0) {
2364 continue;
2365 }
2366
2367 rs = snpInfo[i].rs_number;
2368 a1 = snpInfo[i].a_minor;
2369
2370 if (mapRS2wA.count(rs) != 0) {
2371 if (a1 == mapRS2A1.at(rs)) {
2372 gsl_vector_set(z, c, mapRS2z.at(rs));
2373 } else {
2374 gsl_vector_set(z, c, -1 * mapRS2z.at(rs));
2375 }
2376 vec_cat.push_back(mapRS2cat.at(rs));
2377 gsl_vector_set(w, c, mapRS2wA.at(rs));
2378
2379 c++;
2380 } else {
2381 indicator_snp[i] = 0;
2382 }
2383 }
2384 } else {
2385 for (size_t t = 0; t < msnpInfo.size(); t++) {
2386 snpInfo = msnpInfo[t];
2387
2388 for (size_t i = 0; i < snpInfo.size(); i++) {
2389 if (mindicator_snp[t][i] == 0) {
2390 continue;
2391 }
2392
2393 rs = snpInfo[i].rs_number;
2394 a1 = snpInfo[i].a_minor;
2395
2396 if (mapRS2wA.count(rs) != 0) {
2397 if (a1 == mapRS2A1.at(rs)) {
2398 gsl_vector_set(z, c, mapRS2z.at(rs));
2399 } else {
2400 gsl_vector_set(z, c, -1 * mapRS2z.at(rs));
2401 }
2402 vec_cat.push_back(mapRS2cat.at(rs));
2403 gsl_vector_set(w, c, mapRS2wA.at(rs));
2404
2405 c++;
2406 } else {
2407 mindicator_snp[t][i] = 0;
2408 }
2409 }
2410 }
2411 }
2412
2413 return;
2414 }
2415
2416 // This function updates indicator_snp, and save z-scores and other
2417 // values into vectors.
UpdateSNP(const map<string,double> & mapRS2wA)2418 void PARAM::UpdateSNP(const map<string, double> &mapRS2wA) {
2419 string rs;
2420 if (msnpInfo.size() == 0) {
2421 for (size_t i = 0; i < snpInfo.size(); i++) {
2422 if (indicator_snp[i] == 0) {
2423 continue;
2424 }
2425
2426 rs = snpInfo[i].rs_number;
2427
2428 if (mapRS2wA.count(rs) == 0) {
2429 indicator_snp[i] = 0;
2430 }
2431 }
2432 } else {
2433 for (size_t t = 0; t < msnpInfo.size(); t++) {
2434 snpInfo = msnpInfo[t];
2435
2436 for (size_t i = 0; i < mindicator_snp[t].size(); i++) {
2437 if (mindicator_snp[t][i] == 0) {
2438 continue;
2439 }
2440
2441 rs = snpInfo[i].rs_number;
2442
2443 if (mapRS2wA.count(rs) == 0) {
2444 mindicator_snp[t][i] = 0;
2445 }
2446 }
2447 }
2448 }
2449
2450 return;
2451 }
2452