1 /*
2     Genome-wide Efficient Mixed Model Association (GEMMA)
3     Copyright © 2011-2017, Xiang Zhou
4     Copyright © 2017, Peter Carbonetto
5     Copyright © 2017, Pjotr Prins
6 
7     This program is free software: you can redistribute it and/or modify
8     it under the terms of the GNU General Public License as published by
9     the Free Software Foundation, either version 3 of the License, or
10     (at your option) any later version.
11 
12     This program is distributed in the hope that it will be useful,
13     but WITHOUT ANY WARRANTY; without even the implied warranty of
14     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15     GNU General Public License for more details.
16 
17     You should have received a copy of the GNU General Public License
18     along with this program. If not, see <http://www.gnu.org/licenses/>.
19 */
20 
21 #include <iostream>
22 #include <iomanip>
23 #include <string>
24 #include <algorithm>
25 #include <cmath>
26 #include <cstring>
27 #include <fstream>
28 #include <sys/stat.h>
29 
30 #include "gsl/gsl_blas.h"
31 #include "gsl/gsl_linalg.h"
32 #include "gsl/gsl_matrix.h"
33 #include "gsl/gsl_matrix.h"
34 #include "gsl/gsl_randist.h"
35 #include "gsl/gsl_vector.h"
36 
37 #include "eigenlib.h"
38 #include "gemma.h"
39 #include "gemma_io.h"
40 #include "mathfunc.h"
41 #include "param.h"
42 #include "fastblas.h"
43 
44 using namespace std;
45 
46 // ---- Helper functions which do not use the PARAM scope
47 
48 // Calculate the SNP sets as used in LOCO from mapchr which was filled
49 // from the annotation file. Returns ksnps (used for K) and gwasnps (used for
50 // GWA). Both are complimentary with each other and subsets of setSnps.
51 
LOCO_set_Snps(set<string> & ksnps,set<string> & gwasnps,const map<string,string> mapchr,const string loco)52 void LOCO_set_Snps(set<string> &ksnps, set<string> &gwasnps,
53                    const map<string, string> mapchr, const string loco) {
54   enforce_msg(ksnps.size() == 0, "make sure knsps is not initialized twice");
55   enforce_msg(gwasnps.size() == 0,
56               "make sure gwasnps is not initialized twice");
57   for (auto &kv : mapchr) {
58     auto snp = kv.first;
59     auto chr = kv.second;
60     if (chr != loco) {
61       ksnps.insert(snp);
62     } else {
63       gwasnps.insert(snp);
64     }
65   }
66 }
67 
68 // Trim #individuals to size which is used to write tests that run faster
69 //
70 // Note it actually trims the number of functional individuals
71 // (indicator_idv[x] == 1). This should match indicator_cvt etc. If
72 // this gives problems with certain sets we can simply trim to size.
73 
trim_individuals(vector<int> & idvs,size_t ni_max)74 void trim_individuals(vector<int> &idvs, size_t ni_max) {
75   if (ni_max) {
76     size_t count = 0;
77     for (auto ind = idvs.begin(); ind != idvs.end(); ++ind) {
78       if (*ind)
79         count++;
80       if (count >= ni_max)
81         break;
82     }
83     if (count != idvs.size()) {
84       if (is_debug_mode())
85         cout << "**** TEST MODE: trim individuals from " << idvs.size()
86              << " to " << count << endl;
87       idvs.resize(count);
88     }
89   }
90 }
91 
92 // ---- PARAM class implementation
93 
PARAM(void)94 PARAM::PARAM(void)
95     : a_mode(0), k_mode(1), d_pace(DEFAULT_PACE),
96       file_out("result"), path_out("./output/"), miss_level(0.05),
97       maf_level(0.01), hwe_level(0), r2_level(0.9999), l_min(1e-5), l_max(1e5),
98       n_region(10), p_nr(0.001), em_prec(0.0001), nr_prec(0.0001),
99       em_iter(10000), nr_iter(100), crt(0), pheno_mean(0), noconstrain(false),
100       h_min(-1), h_max(-1), h_scale(-1), rho_min(0.0), rho_max(1.0),
101       rho_scale(-1), logp_min(0.0), logp_max(0.0), logp_scale(-1), h_ngrid(10),
102       rho_ngrid(10), s_min(0), s_max(300), w_step(100000), s_step(1000000),
103       r_pace(10), w_pace(1000), n_accept(0), n_mh(10), geo_mean(2000.0),
104       randseed(-1), window_cm(0), window_bp(0), window_ns(0), n_block(200),
105       error(false), ni_subsample(0), n_cvt(1), n_cat(0), n_vc(1),
106       time_total(0.0), time_G(0.0), time_eigen(0.0), time_UtX(0.0),
107       time_UtZ(0.0), time_opt(0.0), time_Omega(0.0) {}
108 
~PARAM()109 PARAM::~PARAM() {
110   gsl_rng_free(gsl_r);
111 }
112 
113 // Read files: obtain ns_total, ng_total, ns_test, ni_test.
ReadFiles(void)114 void PARAM::ReadFiles(void) {
115   string file_str;
116 
117   // Read cat file.
118   if (!file_mcat.empty()) {
119     if (ReadFile_mcat(file_mcat, mapRS2cat, n_vc) == false) {
120       error = true;
121     }
122   } else if (!file_cat.empty()) {
123     if (ReadFile_cat(file_cat, mapRS2cat, n_vc) == false) {
124       error = true;
125     }
126   }
127 
128   // Read snp weight files.
129   if (!file_wcat.empty()) {
130     if (ReadFile_wsnp(file_wcat, n_vc, mapRS2wcat) == false) {
131       error = true;
132     }
133   }
134   if (!file_wsnp.empty()) {
135     if (ReadFile_wsnp(file_wsnp, mapRS2wsnp) == false) {
136       error = true;
137     }
138   }
139 
140   // Count number of kinship files.
141   if (!file_mk.empty()) {
142     if (CountFileLines(file_mk, n_vc) == false) {
143       error = true;
144     }
145   }
146 
147   // Read SNP set.
148   if (!file_snps.empty()) {
149     if (ReadFile_snps(file_snps, setSnps) == false) {
150       error = true;
151     }
152   } else {
153     setSnps.clear();
154   }
155 
156   // Read KSNP set.
157   if (!file_ksnps.empty()) {
158     if (ReadFile_snps(file_ksnps, setKSnps) == false) {
159       error = true;
160     }
161   } else {
162     setKSnps.clear();
163   }
164 
165   // For prediction.
166   if (!file_epm.empty()) {
167     if (ReadFile_est(file_epm, est_column, mapRS2est) == false) {
168       error = true;
169     }
170     if (!file_bfile.empty()) {
171       file_str = file_bfile + ".bim";
172       if (ReadFile_bim(file_str, snpInfo) == false) {
173         error = true;
174       }
175       file_str = file_bfile + ".fam";
176       if (ReadFile_fam(file_str, indicator_pheno, pheno, mapID2num, p_column) ==
177           false) {
178         error = true;
179       }
180     }
181 
182     if (!file_geno.empty()) {
183       if (ReadFile_pheno(file_pheno, indicator_pheno, pheno, p_column) ==
184           false) {
185         error = true;
186       }
187 
188       if (CountFileLines(file_geno, ns_total) == false) {
189         error = true;
190       }
191     }
192 
193     if (!file_ebv.empty()) {
194       if (ReadFile_column(file_ebv, indicator_bv, vec_bv, 1) == false) {
195         error = true;
196       }
197     }
198 
199     if (!file_log.empty()) {
200       if (ReadFile_log(file_log, pheno_mean) == false) {
201         error = true;
202       }
203     }
204 
205     // Convert indicator_pheno to indicator_idv.
206     int k = 1;
207     for (size_t i = 0; i < indicator_pheno.size(); i++) {
208       k = 1;
209       for (size_t j = 0; j < indicator_pheno[i].size(); j++) {
210         if (indicator_pheno[i][j] == 0) {
211           k = 0;
212         }
213       }
214       indicator_idv.push_back(k);
215     }
216 
217     ns_test = 0;
218 
219     return;
220   }
221 
222   // Read covariates before the genotype files.
223   if (!file_cvt.empty()) {
224     if (ReadFile_cvt(file_cvt, indicator_cvt, cvt, n_cvt) == false) {
225       error = true;
226     }
227     if ((indicator_cvt).size() == 0) {
228       n_cvt = 1;
229     }
230   } else {
231     n_cvt = 1;
232   }
233   trim_individuals(indicator_cvt, ni_max);
234 
235   if (!file_gxe.empty()) {
236     if (ReadFile_column(file_gxe, indicator_gxe, gxe, 1) == false) {
237       error = true;
238     }
239   }
240   if (!file_weight.empty()) {
241     if (ReadFile_column(file_weight, indicator_weight, weight, 1) == false) {
242       error = true;
243     }
244   }
245 
246   trim_individuals(indicator_idv, ni_max);
247 
248   // Read genotype and phenotype file for PLINK format.
249   if (!file_bfile.empty()) {
250     file_str = file_bfile + ".bim";
251     snpInfo.clear();
252     if (ReadFile_bim(file_str, snpInfo) == false) {
253       error = true;
254     }
255 
256     // If both fam file and pheno files are used, use
257     // phenotypes inside the pheno file.
258     if (!file_pheno.empty()) {
259 
260       // Phenotype file before genotype file.
261       if (ReadFile_pheno(file_pheno, indicator_pheno, pheno, p_column) ==
262           false) {
263         error = true;
264       }
265     } else {
266       file_str = file_bfile + ".fam";
267       if (ReadFile_fam(file_str, indicator_pheno, pheno, mapID2num, p_column) ==
268           false) {
269         error = true;
270       }
271     }
272 
273     // Post-process covariates and phenotypes, obtain
274     // ni_test, save all useful covariates.
275     ProcessCvtPhen();
276 
277     // Obtain covariate matrix.
278     auto W1 = gsl_matrix_safe_alloc(ni_test, n_cvt);
279     CopyCvt(W1);
280 
281     file_str = file_bfile + ".bed";
282     if (ReadFile_bed(file_str, setSnps, W1, indicator_idv, indicator_snp,
283                      snpInfo, maf_level, miss_level, hwe_level, r2_level,
284                      ns_test) == false) {
285       error = true;
286     }
287     gsl_matrix_free(W1);
288     ns_total = indicator_snp.size();
289   }
290 
291   // Read genotype and phenotype file for BIMBAM format.
292   if (!file_geno.empty()) {
293 
294     // Annotation file before genotype file.
295     if (!file_anno.empty()) {
296       if (ReadFile_anno(file_anno, mapRS2chr, mapRS2bp, mapRS2cM) == false) {
297         error = true;
298       }
299     }
300 
301     // Phenotype file before genotype file.
302     if (ReadFile_pheno(file_pheno, indicator_pheno, pheno, p_column) == false) {
303       error = true;
304     }
305 
306     // Post-process covariates and phenotypes, obtain
307     // ni_test, save all useful covariates.
308     ProcessCvtPhen();
309 
310     // Obtain covariate matrix.
311     auto W2 = gsl_matrix_safe_alloc(ni_test, n_cvt);
312     CopyCvt(W2);
313 
314     trim_individuals(indicator_idv, ni_max);
315     trim_individuals(indicator_cvt, ni_max);
316     if (ReadFile_geno(file_geno, setSnps, W2, indicator_idv, indicator_snp,
317                       maf_level, miss_level, hwe_level, r2_level, mapRS2chr,
318                       mapRS2bp, mapRS2cM, snpInfo, ns_test) == false) {
319       error = true;
320       return;
321     }
322     gsl_matrix_free(W2);
323 
324     ns_total = indicator_snp.size();
325   }
326 
327   // Read genotype file for multiple PLINK files.
328   if (!file_mbfile.empty()) {
329     igzstream infile(file_mbfile.c_str(), igzstream::in);
330     enforce_msg(infile,"fail to open mbfile file");
331     string file_name;
332     size_t t = 0, ns_test_tmp = 0;
333     gsl_matrix *W3 = NULL;
334     while (!safeGetline(infile, file_name).eof()) {
335       file_str = file_name + ".bim";
336 
337       if (ReadFile_bim(file_str, snpInfo) == false) {
338         error = true;
339       }
340 
341       if (t == 0) {
342 
343         // If both fam file and pheno files are used, use
344         // phenotypes inside the pheno file.
345         if (!file_pheno.empty()) {
346 
347           // Phenotype file before genotype file.
348           if (ReadFile_pheno(file_pheno, indicator_pheno, pheno, p_column) ==
349               false) {
350             error = true;
351           }
352         } else {
353           file_str = file_name + ".fam";
354           if (ReadFile_fam(file_str, indicator_pheno, pheno, mapID2num,
355                            p_column) == false) {
356             error = true;
357           }
358         }
359 
360         // Post-process covariates and phenotypes, obtain
361         // ni_test, save all useful covariates.
362         ProcessCvtPhen();
363 
364         // Obtain covariate matrix.
365         W3 = gsl_matrix_safe_alloc(ni_test, n_cvt);
366         CopyCvt(W3);
367       }
368 
369       file_str = file_name + ".bed";
370       if (ReadFile_bed(file_str, setSnps, W3, indicator_idv, indicator_snp,
371                        snpInfo, maf_level, miss_level, hwe_level, r2_level,
372                        ns_test_tmp) == false) {
373         error = true;
374       }
375       mindicator_snp.push_back(indicator_snp);
376       msnpInfo.push_back(snpInfo);
377       ns_test += ns_test_tmp;
378       ns_total += indicator_snp.size();
379 
380       t++;
381     }
382 
383     if (W3) gsl_matrix_free(W3);
384 
385     infile.close();
386     infile.clear();
387   }
388 
389   // Read genotype and phenotype file for multiple BIMBAM files.
390   if (!file_mgeno.empty()) {
391 
392     // Annotation file before genotype file.
393     if (!file_anno.empty()) {
394       if (ReadFile_anno(file_anno, mapRS2chr, mapRS2bp, mapRS2cM) == false) {
395         error = true;
396       }
397     }
398 
399     // Phenotype file before genotype file.
400     if (ReadFile_pheno(file_pheno, indicator_pheno, pheno, p_column) == false) {
401       error = true;
402     }
403 
404     // Post-process covariates and phenotypes, obtain ni_test,
405     // save all useful covariates.
406     ProcessCvtPhen();
407 
408     // Obtain covariate matrix.
409     gsl_matrix *W4 = gsl_matrix_safe_alloc(ni_test, n_cvt);
410     CopyCvt(W4);
411 
412     igzstream infile(file_mgeno.c_str(), igzstream::in);
413     if (!infile) {
414       cout << "error! fail to open mgeno file: " << file_mgeno << endl;
415       error = true;
416       return;
417     }
418 
419     string file_name;
420     size_t ns_test_tmp;
421     while (!safeGetline(infile, file_name).eof()) {
422       if (ReadFile_geno(file_name, setSnps, W4, indicator_idv, indicator_snp,
423                         maf_level, miss_level, hwe_level, r2_level, mapRS2chr,
424                         mapRS2bp, mapRS2cM, snpInfo, ns_test_tmp) == false) {
425         error = true;
426       }
427 
428       mindicator_snp.push_back(indicator_snp);
429       msnpInfo.push_back(snpInfo);
430       ns_test += ns_test_tmp;
431       ns_total += indicator_snp.size();
432     }
433 
434     gsl_matrix_free(W4);
435 
436     infile.close();
437     infile.clear();
438   }
439 
440   if (!file_gene.empty()) {
441     if (ReadFile_pheno(file_pheno, indicator_pheno, pheno, p_column) == false) {
442       error = true;
443     }
444 
445     // Convert indicator_pheno to indicator_idv.
446     int k = 1;
447     for (size_t i = 0; i < indicator_pheno.size(); i++) {
448       k = 1;
449       for (size_t j = 0; j < indicator_pheno[i].size(); j++) {
450         if (indicator_pheno[i][j] == 0) {
451           k = 0;
452         }
453       }
454       indicator_idv.push_back(k);
455     }
456 
457     // Post-process covariates and phenotypes, obtain
458     // ni_test, save all useful covariates.
459     ProcessCvtPhen();
460 
461     // Obtain covariate matrix.
462     // gsl_matrix *W5 = gsl_matrix_alloc(ni_test, n_cvt);
463     // CopyCvt(W5);
464 
465     if (ReadFile_gene(file_gene, vec_read, snpInfo, ng_total) == false) {
466       error = true;
467     }
468   }
469 
470   // Read is after gene file.
471   if (!file_read.empty()) {
472     if (ReadFile_column(file_read, indicator_read, vec_read, 1) == false) {
473       error = true;
474     }
475 
476     ni_test = 0;
477     for (vector<int>::size_type i = 0; i < (indicator_idv).size(); ++i) {
478       indicator_idv[i] *= indicator_read[i];
479       ni_test += indicator_idv[i];
480     }
481 
482     enforce_msg(ni_test,"number of analyzed individuals equals 0.");
483   }
484 
485   // For ridge prediction, read phenotype only.
486   if (file_geno.empty() && file_gene.empty() && !file_pheno.empty()) {
487     if (ReadFile_pheno(file_pheno, indicator_pheno, pheno, p_column) == false) {
488       error = true;
489     }
490 
491     // Post-process covariates and phenotypes, obtain
492     // ni_test, save all useful covariates.
493     ProcessCvtPhen();
494   }
495 
496   // Compute setKSnps when -loco is passed in
497   if (!loco.empty()) {
498     LOCO_set_Snps(setKSnps, setGWASnps, mapRS2chr, loco);
499   }
500   return;
501 }
502 
CheckParam(void)503 void PARAM::CheckParam(void) {
504   struct stat fileInfo;
505   string str;
506 
507   // Check parameters.
508   if (k_mode != 1 && k_mode != 2) {
509     cout << "error! unknown kinship/relatedness input mode: " << k_mode << endl;
510     error = true;
511   }
512   if (a_mode != 1 && a_mode != 2 && a_mode != 3 && a_mode != 4 && a_mode != 5 &&
513       a_mode != 11 && a_mode != 12 && a_mode != 13 && a_mode != 14 &&
514       a_mode != 15 && a_mode != 21 && a_mode != 22 && a_mode != 25 &&
515       a_mode != 26 && a_mode != 27 && a_mode != 28 && a_mode != 31 &&
516       a_mode != 41 && a_mode != 42 && a_mode != 43 && a_mode != 51 &&
517       a_mode != 52 && a_mode != 53 && a_mode != 54 && a_mode != 61 &&
518       a_mode != 62 && a_mode != 63 && a_mode != 66 && a_mode != 67 &&
519       a_mode != 71) {
520     cout << "error! unknown analysis mode: " << a_mode
521          << ". make sure -gk or -eigen or -lmm or -bslmm -predict or "
522          << "-calccov is specified correctly." << endl;
523     error = true;
524   }
525   if (miss_level > 1) {
526     cout << "error! missing level needs to be between 0 and 1. "
527          << "current value = " << miss_level << endl;
528     error = true;
529   }
530   if (maf_level > 0.5) {
531     cout << "error! maf level needs to be between 0 and 0.5. "
532          << "current value = " << maf_level << endl;
533     error = true;
534   }
535   if (hwe_level > 1) {
536     cout << "error! hwe level needs to be between 0 and 1. "
537          << "current value = " << hwe_level << endl;
538     error = true;
539   }
540   if (r2_level > 1) {
541     cout << "error! r2 level needs to be between 0 and 1. "
542          << "current value = " << r2_level << endl;
543     error = true;
544   }
545 
546   if (l_max < l_min) {
547     cout << "error! maximum lambda value must be larger than the "
548          << "minimal value. current values = " << l_max << " and " << l_min
549          << endl;
550     error = true;
551   }
552   if (h_max < h_min) {
553     cout << "error! maximum h value must be larger than the minimal "
554          << "value. current values = " << h_max << " and " << h_min << endl;
555     error = true;
556   }
557   if (s_max < s_min) {
558     cout << "error! maximum s value must be larger than the minimal "
559          << "value. current values = " << s_max << " and " << s_min << endl;
560     error = true;
561   }
562   if (rho_max < rho_min) {
563     cout << "error! maximum rho value must be larger than the"
564          << "minimal value. current values = " << rho_max << " and " << rho_min
565          << endl;
566     error = true;
567   }
568   if (logp_max < logp_min) {
569     cout << "error! maximum logp value must be larger than the "
570          << "minimal value. current values = " << logp_max / log(10) << " and "
571          << logp_min / log(10) << endl;
572     error = true;
573   }
574 
575   if (h_max > 1) {
576     cout << "error! h values must be bewtween 0 and 1. current "
577          << "values = " << h_max << " and " << h_min << endl;
578     error = true;
579   }
580   if (rho_max > 1) {
581     cout << "error! rho values must be between 0 and 1. current "
582          << "values = " << rho_max << " and " << rho_min << endl;
583     error = true;
584   }
585   if (logp_max > 0) {
586     cout << "error! maximum logp value must be smaller than 0. "
587          << "current values = " << logp_max / log(10) << " and "
588          << logp_min / log(10) << endl;
589     error = true;
590   }
591   if (l_max < l_min) {
592     cout << "error! maximum lambda value must be larger than the "
593          << "minimal value. current values = " << l_max << " and " << l_min
594          << endl;
595     error = true;
596   }
597 
598   if (h_scale > 1.0) {
599     cout << "error! hscale value must be between 0 and 1. "
600          << "current value = " << h_scale << endl;
601     error = true;
602   }
603   if (rho_scale > 1.0) {
604     cout << "error! rscale value must be between 0 and 1. "
605          << "current value = " << rho_scale << endl;
606     error = true;
607   }
608   if (logp_scale > 1.0) {
609     cout << "error! pscale value must be between 0 and 1. "
610          << "current value = " << logp_scale << endl;
611     error = true;
612   }
613 
614   if (rho_max == 1 && rho_min == 1 && a_mode == 12) {
615     cout << "error! ridge regression does not support a rho "
616          << "parameter. current values = " << rho_max << " and " << rho_min
617          << endl;
618     error = true;
619   }
620 
621   if (window_cm < 0) {
622     cout << "error! windowcm values must be non-negative. "
623          << "current values = " << window_cm << endl;
624     error = true;
625   }
626 
627   if (window_cm == 0 && window_bp == 0 && window_ns == 0) {
628     window_bp = 1000000;
629   }
630 
631   // Check p_column, and (no need to) sort p_column into
632   // ascending order.
633   if (p_column.size() == 0) {
634     p_column.push_back(1);
635   } else {
636     for (size_t i = 0; i < p_column.size(); i++) {
637       for (size_t j = 0; j < i; j++) {
638         if (p_column[i] == p_column[j]) {
639           cout << "error! identical phenotype "
640                << "columns: " << p_column[i] << endl;
641           error = true;
642         }
643       }
644     }
645   }
646 
647   n_ph = p_column.size();
648 
649   // Only LMM option (and one prediction option) can deal with
650   // multiple phenotypes and no gene expression files.
651   if (n_ph > 1 && a_mode != 1 && a_mode != 2 && a_mode != 3 && a_mode != 4 &&
652       a_mode != 43) {
653     cout << "error! the current analysis mode " << a_mode
654          << " can not deal with multiple phenotypes." << endl;
655     error = true;
656   }
657   if (n_ph > 1 && !file_gene.empty()) {
658     cout << "error! multiple phenotype analysis option not "
659          << "allowed with gene expression files. " << endl;
660     error = true;
661   }
662 
663   if (p_nr > 1) {
664     cout << "error! pnr value must be between 0 and 1. current value = " << p_nr
665          << endl;
666     error = true;
667   }
668 
669   // check est_column
670   if (est_column.size() == 0) {
671     if (file_ebv.empty()) {
672       est_column.push_back(2);
673       est_column.push_back(5);
674       est_column.push_back(6);
675       est_column.push_back(7);
676     } else {
677       est_column.push_back(2);
678       est_column.push_back(0);
679       est_column.push_back(6);
680       est_column.push_back(7);
681     }
682   }
683 
684   if (est_column.size() != 4) {
685     cout << "error! -en not followed by four numbers. current number = "
686          << est_column.size() << endl;
687     error = true;
688   }
689   if (est_column[0] == 0) {
690     cout << "error! -en rs column can not be zero. current number = "
691          << est_column.size() << endl;
692     error = true;
693   }
694 
695   // Check if files are compatible with each other, and if files exist.
696   if (!file_bfile.empty()) {
697     str = file_bfile + ".bim";
698     if (stat(str.c_str(), &fileInfo) == -1) {
699       cout << "error! fail to open .bim file: " << str << endl;
700       error = true;
701     }
702     str = file_bfile + ".bed";
703     if (stat(str.c_str(), &fileInfo) == -1) {
704       cout << "error! fail to open .bed file: " << str << endl;
705       error = true;
706     }
707     str = file_bfile + ".fam";
708     if (stat(str.c_str(), &fileInfo) == -1) {
709       cout << "error! fail to open .fam file: " << str << endl;
710       error = true;
711     }
712   }
713 
714   if ((!file_geno.empty() || !file_gene.empty())) {
715     str = file_pheno;
716     if (stat(str.c_str(), &fileInfo) == -1) {
717       cout << "error! fail to open phenotype file: " << str << endl;
718       error = true;
719     }
720   }
721 
722   str = file_geno;
723   if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) {
724     cout << "error! fail to open mean genotype file: " << str << endl;
725     error = true;
726   }
727 
728   str = file_gene;
729   if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) {
730     cout << "error! fail to open gene expression file: " << str << endl;
731     error = true;
732   }
733 
734   str = file_cat;
735   if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) {
736     cout << "error! fail to open category file: " << str << endl;
737     error = true;
738   }
739 
740   str = file_mcat;
741   if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) {
742     cout << "error! fail to open mcategory file: " << str << endl;
743     error = true;
744   }
745 
746   str = file_beta;
747   if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) {
748     cout << "error! fail to open beta file: " << str << endl;
749     error = true;
750   }
751 
752   str = file_cor;
753   if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) {
754     cout << "error! fail to open correlation file: " << str << endl;
755     error = true;
756   }
757 
758   if (!file_study.empty()) {
759     str = file_study + ".Vq.txt";
760     if (stat(str.c_str(), &fileInfo) == -1) {
761       cout << "error! fail to open .Vq.txt file: " << str << endl;
762       error = true;
763     }
764     str = file_study + ".q.txt";
765     if (stat(str.c_str(), &fileInfo) == -1) {
766       cout << "error! fail to open .q.txt file: " << str << endl;
767       error = true;
768     }
769     str = file_study + ".size.txt";
770     if (stat(str.c_str(), &fileInfo) == -1) {
771       cout << "error! fail to open .size.txt file: " << str << endl;
772       error = true;
773     }
774   }
775 
776   if (!file_ref.empty()) {
777     str = file_ref + ".S.txt";
778     if (stat(str.c_str(), &fileInfo) == -1) {
779       cout << "error! fail to open .S.txt file: " << str << endl;
780       error = true;
781     }
782     str = file_ref + ".size.txt";
783     if (stat(str.c_str(), &fileInfo) == -1) {
784       cout << "error! fail to open .size.txt file: " << str << endl;
785       error = true;
786     }
787   }
788 
789   str = file_mstudy;
790   if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) {
791     cout << "error! fail to open mstudy file: " << str << endl;
792     error = true;
793   }
794 
795   str = file_mref;
796   if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) {
797     cout << "error! fail to open mref file: " << str << endl;
798     error = true;
799   }
800 
801   str = file_mgeno;
802   if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) {
803     cout << "error! fail to open mgeno file: " << str << endl;
804     error = true;
805   }
806 
807   str = file_mbfile;
808   if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) {
809     cout << "error! fail to open mbfile file: " << str << endl;
810     error = true;
811   }
812 
813   size_t flag = 0;
814   if (!file_bfile.empty()) {
815     flag++;
816   }
817   if (!file_geno.empty()) {
818     flag++;
819   }
820   if (!file_gene.empty()) {
821     flag++;
822   }
823 
824   // Always set up random environment.
825   gsl_rng_env_setup(); // sets gsl_rng_default_seed
826   const gsl_rng_type *T = gsl_rng_default; // pick up environment GSL_RNG_SEED
827 
828   if (randseed >= 0)
829     gsl_rng_default_seed = randseed; // CLI option used
830   else if (gsl_rng_default_seed == 0) { // by default we will randomize the seed
831     time_t rawtime;
832     time(&rawtime);
833     tm *ptm = gmtime(&rawtime);
834 
835     gsl_rng_default_seed =
836       (unsigned)(ptm->tm_hour % 24 * 3600 + ptm->tm_min * 60 + ptm->tm_sec);
837   }
838   gsl_r = gsl_rng_alloc(T);
839 
840   if (is_debug_mode()) {
841     printf ("GSL random generator type: %s; ", gsl_rng_name (gsl_r));
842     printf ("seed = %lu (option %li); ", gsl_rng_default_seed, randseed);
843     printf ("first value = %lu\n", gsl_rng_get (gsl_r));
844   }
845 
846   if (flag != 1 && a_mode != 15 && a_mode != 27 && a_mode != 28 &&
847       a_mode != 43 && a_mode != 5 && a_mode != 61 && a_mode != 62 &&
848       a_mode != 63 && a_mode != 66 && a_mode != 67) {
849     cout << "error! either plink binary files, or bimbam mean"
850          << "genotype files, or gene expression files are required." << endl;
851     error = true;
852   }
853 
854   if (file_pheno.empty() && (a_mode == 43 || a_mode == 5)) {
855     cout << "error! phenotype file is required." << endl;
856     error = true;
857   }
858 
859   if (a_mode == 61 || a_mode == 62) {
860     if (!file_beta.empty()) {
861       if (file_mbfile.empty() && file_bfile.empty() && file_mgeno.empty() &&
862           file_geno.empty() && file_mref.empty() && file_ref.empty()) {
863         cout << "error! missing genotype file or ref/mref file." << endl;
864         error = true;
865       }
866     } else if (!file_pheno.empty()) {
867       if (file_kin.empty() && (file_ku.empty() || file_kd.empty()) &&
868           file_mk.empty()) {
869         cout << "error! missing relatedness file. " << endl;
870         error = true;
871       }
872     } else if ((file_mstudy.empty() && file_study.empty()) ||
873                (file_mref.empty() && file_ref.empty())) {
874       cout << "error! either beta file, or phenotype files or "
875            << "study/ref mstudy/mref files are required." << endl;
876       error = true;
877     }
878   }
879 
880   if (a_mode == 63) {
881     if (file_kin.empty() && (file_ku.empty() || file_kd.empty()) &&
882         file_mk.empty()) {
883       cout << "error! missing relatedness file. " << endl;
884       error = true;
885     }
886     if (file_pheno.empty()) {
887       cout << "error! missing phenotype file." << endl;
888       error = true;
889     }
890   }
891 
892   if (a_mode == 66 || a_mode == 67) {
893     if (file_beta.empty() || (file_mbfile.empty() && file_bfile.empty() &&
894                               file_mgeno.empty() && file_geno.empty())) {
895       cout << "error! missing beta file or genotype file." << endl;
896       error = true;
897     }
898   }
899 
900   if (!file_epm.empty() && file_bfile.empty() && file_geno.empty()) {
901     cout << "error! estimated parameter file also requires genotype "
902          << "file." << endl;
903     error = true;
904   }
905   if (!file_ebv.empty() && file_kin.empty()) {
906     cout << "error! estimated breeding value file also requires "
907          << "relatedness file." << endl;
908     error = true;
909   }
910 
911   if (!file_log.empty() && pheno_mean != 0) {
912     cout << "error! either log file or mu value can be provide." << endl;
913     error = true;
914   }
915 
916   enforce_fexists(file_snps, "open file");
917   enforce_fexists(file_ksnps, "open file");
918   enforce_fexists(file_gwasnps, "open file");
919   enforce_fexists(file_anno, "open file");
920 
921   if (!loco.empty()) {
922     enforce_msg((a_mode >= 1 && a_mode <= 4) || a_mode == 21 || a_mode == 22,
923                 "LOCO only works with LMM and K");
924     // enforce_msg(file_bfile.empty(), "LOCO does not work with PLink (yet)");
925     enforce_msg(file_gxe.empty(), "LOCO does not support GXE (yet)");
926     enforce_msg(!file_anno.empty(),
927                 "LOCO requires annotation file (-a switch)");
928     enforce_msg(file_ksnps.empty(), "LOCO does not allow -ksnps switch");
929     enforce_msg(file_gwasnps.empty(), "LOCO does not allow -gwasnps switch");
930   }
931 
932   enforce_fexists(file_kin, "open file");
933   enforce_fexists(file_mk, "open file");
934   enforce_fexists(file_cvt, "open file");
935   enforce_fexists(file_gxe, "open file");
936   enforce_fexists(file_log, "open file");
937   enforce_fexists(file_weight, "open file");
938   enforce_fexists(file_epm, "open file");
939   enforce_fexists(file_ebv, "open file");
940   enforce_fexists(file_read, "open file");
941 
942   // Check if files are compatible with analysis mode.
943   if (k_mode == 2 && !file_geno.empty()) {
944     cout << "error! use \"-km 1\" when using bimbam mean genotype "
945          << "file. " << endl;
946     error = true;
947   }
948 
949   if ((a_mode == 1 || a_mode == 2 || a_mode == 3 || a_mode == 4 ||
950        a_mode == 5 || a_mode == 31) &&
951       (file_kin.empty() && (file_ku.empty() || file_kd.empty()))) {
952     cout << "error! missing relatedness file. " << endl;
953     error = true;
954   }
955 
956   if ((a_mode == 43) && file_kin.empty()) {
957     cout << "error! missing relatedness file. -predict option requires "
958          << "-k option to provide a relatedness file." << endl;
959     error = true;
960   }
961 
962   if ((a_mode == 11 || a_mode == 12 || a_mode == 13 || a_mode == 14 ||
963        a_mode == 16) &&
964       !file_cvt.empty()) {
965     cout << "error! -bslmm option does not support covariates files." << endl;
966     error = true;
967   }
968 
969   if (a_mode == 41 || a_mode == 42) {
970     if (!file_cvt.empty()) {
971       cout << "error! -predict option does not support "
972            << "covariates files." << endl;
973       error = true;
974     }
975     if (file_epm.empty()) {
976       cout << "error! -predict option requires estimated "
977            << "parameter files." << endl;
978       error = true;
979     }
980   }
981 
982   if (file_beta.empty() && (a_mode == 27 || a_mode == 28)) {
983     cout << "error! beta effects file is required." << endl;
984     error = true;
985   }
986 
987   return;
988 }
989 
CheckData(void)990 void PARAM::CheckData(void) {
991 
992   if ((a_mode == 66 || a_mode == 67) && (v_pve.size() != n_vc)) {
993     cout << "error! the number of pve estimates does not equal to "
994          << "the number of categories in the cat file:" << v_pve.size() << " "
995          << n_vc << endl;
996     error = true;
997   }
998 
999   if ((indicator_cvt).size() != 0 &&
1000       (indicator_cvt).size() != (indicator_idv).size()) {
1001     error = true;
1002     cout << "error! number of rows in the covariates file do not "
1003          << "match the number of individuals. " << indicator_cvt.size() << endl;
1004     return;
1005   }
1006   if ((indicator_gxe).size() != 0 &&
1007       (indicator_gxe).size() != (indicator_idv).size()) {
1008     error = true;
1009     cout << "error! number of rows in the gxe file do not match the number "
1010          << "of individuals. " << endl;
1011     return;
1012   }
1013   if ((indicator_weight).size() != 0 &&
1014       (indicator_weight).size() != (indicator_idv).size()) {
1015     error = true;
1016     cout << "error! number of rows in the weight file do not match "
1017          << "the number of individuals. " << endl;
1018     return;
1019   }
1020 
1021   if ((indicator_read).size() != 0 &&
1022       (indicator_read).size() != (indicator_idv).size()) {
1023     error = true;
1024     cout << "error! number of rows in the total read file do not "
1025          << "match the number of individuals. " << endl;
1026     return;
1027   }
1028 
1029   // Calculate ni_total and ni_test, and set indicator_idv to 0
1030   // whenever indicator_cvt=0, and calculate np_obs and np_miss.
1031   ni_total = (indicator_idv).size();
1032 
1033   ni_test = 0;
1034   for (vector<int>::size_type i = 0; i < (indicator_idv).size(); ++i) {
1035     if (indicator_idv[i] == 0) {
1036       continue;
1037     }
1038     ni_test++;
1039   }
1040 
1041   ni_cvt = 0;
1042   for (size_t i = 0; i < indicator_cvt.size(); i++) {
1043     if (indicator_cvt[i] == 0) {
1044       continue;
1045     }
1046     ni_cvt++;
1047   }
1048 
1049   np_obs = 0;
1050   np_miss = 0;
1051   for (size_t i = 0; i < indicator_pheno.size(); i++) {
1052     if (indicator_cvt.size() != 0) {
1053       if (indicator_cvt[i] == 0) {
1054         continue;
1055       }
1056     }
1057 
1058     if (indicator_gxe.size() != 0) {
1059       if (indicator_gxe[i] == 0) {
1060         continue;
1061       }
1062     }
1063 
1064     if (indicator_weight.size() != 0) {
1065       if (indicator_weight[i] == 0) {
1066         continue;
1067       }
1068     }
1069 
1070     for (size_t j = 0; j < indicator_pheno[i].size(); j++) {
1071       if (indicator_pheno[i][j] == 0) {
1072         np_miss++;
1073       } else {
1074         np_obs++;
1075       }
1076     }
1077   }
1078 
1079   enforce_msg(ni_test,"number of analyzed individuals equals 0.");
1080 
1081   if (ni_test == 0 && file_cor.empty() && file_mstudy.empty() &&
1082       file_study.empty() && file_beta.empty() && file_bf.empty()) {
1083     error = true;
1084     cout << "error! number of analyzed individuals equals 0. " << endl;
1085     return;
1086   }
1087 
1088   if (a_mode == 43) {
1089     if (ni_cvt == ni_test) {
1090       error = true;
1091       cout << "error! no individual has missing "
1092            << "phenotypes." << endl;
1093       return;
1094     }
1095     if ((np_obs + np_miss) != (ni_cvt * n_ph)) {
1096       error = true;
1097       cout << "error! number of phenotypes do not match the "
1098            << "summation of missing and observed phenotypes." << endl;
1099       return;
1100     }
1101   }
1102 
1103   // Output some information.
1104   if (file_cor.empty() && file_mstudy.empty() && file_study.empty() &&
1105       a_mode != 15 && a_mode != 27 && a_mode != 28) {
1106     cout << "## number of total individuals = " << ni_total << endl;
1107     if (a_mode == 43) {
1108       cout << "## number of analyzed individuals = " << ni_cvt << endl;
1109       cout << "## number of individuals with full phenotypes = " << ni_test
1110            << endl;
1111     } else {
1112       cout << "## number of analyzed individuals = " << ni_test << endl;
1113     }
1114     cout << "## number of covariates = " << n_cvt << endl;
1115     cout << "## number of phenotypes = " << n_ph << endl;
1116     if (a_mode == 43) {
1117       cout << "## number of observed data = " << np_obs << endl;
1118       cout << "## number of missing data = " << np_miss << endl;
1119     }
1120     if (!file_gene.empty()) {
1121       cout << "## number of total genes = " << ng_total << endl;
1122     } else if (file_epm.empty() && a_mode != 43 && a_mode != 5) {
1123       if (!loco.empty())
1124         cout << "## leave one chromosome out (LOCO) = " << setw(8) << loco << endl;
1125       cout << "## number of total SNPs/var        = " << setw(8) << ns_total << endl;
1126       if (setSnps.size())
1127         cout << "## number of considered SNPS       = " << setw(8) << setSnps.size() << endl;
1128       if (setKSnps.size())
1129         cout << "## number of SNPS for K            = " << setw(8) << setKSnps.size() << endl;
1130       if (setGWASnps.size())
1131         cout << "## number of SNPS for GWAS         = " << setw(8) << setGWASnps.size() << endl;
1132       cout << "## number of analyzed SNPs         = " << setw(8) << ns_test << endl;
1133     } else {
1134     }
1135   }
1136 
1137   // Set d_pace to 1000 for gene expression.
1138   if (!file_gene.empty() && d_pace == DEFAULT_PACE) {
1139     d_pace = 1000;
1140   }
1141 
1142   // For case-control studies, count # cases and # controls.
1143   int flag_cc = 0;
1144   if (a_mode == 13) {
1145     ni_case = 0;
1146     ni_control = 0;
1147     for (size_t i = 0; i < indicator_idv.size(); i++) {
1148       if (indicator_idv[i] == 0) {
1149         continue;
1150       }
1151 
1152       if (pheno[i][0] == 0) {
1153         ni_control++;
1154       } else if (pheno[i][0] == 1) {
1155         ni_case++;
1156       } else {
1157         flag_cc = 1;
1158       }
1159     }
1160     cout << "## number of cases = " << ni_case << endl;
1161     cout << "## number of controls = " << ni_control << endl;
1162   }
1163 
1164   if (flag_cc == 1) {
1165     cout << "Unexpected non-binary phenotypes for "
1166          << "case/control analysis. Use default (BSLMM) analysis instead."
1167          << endl;
1168     a_mode = 11;
1169   }
1170 
1171   // Set parameters for BSLMM and check for predict.
1172   if (a_mode == 11 || a_mode == 12 || a_mode == 13 || a_mode == 14) {
1173     if (a_mode == 11) {
1174       n_mh = 1;
1175     }
1176     if (logp_min == 0) {
1177       logp_min = -1.0 * log((double)ns_test);
1178     }
1179 
1180     if (h_scale == -1) {
1181       h_scale = min(1.0, 10.0 / sqrt((double)ni_test));
1182     }
1183     if (rho_scale == -1) {
1184       rho_scale = min(1.0, 10.0 / sqrt((double)ni_test));
1185     }
1186     if (logp_scale == -1) {
1187       logp_scale = min(1.0, 5.0 / sqrt((double)ni_test));
1188     }
1189 
1190     if (h_min == -1) {
1191       h_min = 0.0;
1192     }
1193     if (h_max == -1) {
1194       h_max = 1.0;
1195     }
1196 
1197     if (s_max > ns_test) {
1198       s_max = ns_test;
1199       cout << "s_max is re-set to the number of analyzed SNPs." << endl;
1200     }
1201     if (s_max < s_min) {
1202       cout << "error! maximum s value must be larger than the "
1203            << "minimal value. current values = " << s_max << " and " << s_min
1204            << endl;
1205       error = true;
1206     }
1207   } else if (a_mode == 41 || a_mode == 42) {
1208     if (indicator_bv.size() != 0) {
1209       if (indicator_idv.size() != indicator_bv.size()) {
1210         cout << "error! number of rows in the "
1211              << "phenotype file does not match that in the "
1212              << "estimated breeding value file: " << indicator_idv.size()
1213              << "\t" << indicator_bv.size() << endl;
1214         error = true;
1215       } else {
1216         size_t flag_bv = 0;
1217         for (size_t i = 0; i < (indicator_bv).size(); ++i) {
1218           if (indicator_idv[i] != indicator_bv[i]) {
1219             flag_bv++;
1220           }
1221         }
1222         if (flag_bv != 0) {
1223           cout << "error! individuals with missing value in the "
1224                << "phenotype file does not match that in the "
1225                << "estimated breeding value file: " << flag_bv << endl;
1226           error = true;
1227         }
1228       }
1229     }
1230   }
1231 
1232   if (a_mode == 62 && !file_beta.empty() && mapRS2wcat.size() == 0) {
1233     cout << "vc analysis with beta files requires -wcat file." << endl;
1234     error = true;
1235   }
1236   if (a_mode == 67 && mapRS2wcat.size() == 0) {
1237     cout << "ci analysis with beta files requires -wcat file." << endl;
1238     error = true;
1239   }
1240 
1241   // File_mk needs to contain more than one line.
1242   if (n_vc == 1 && !file_mk.empty()) {
1243     cout << "error! -mk file should contain more than one line." << endl;
1244     error = true;
1245   }
1246 
1247   return;
1248 }
1249 
PrintSummary()1250 void PARAM::PrintSummary() {
1251   if (n_ph == 1) {
1252     cout << "pve estimate =" << pve_null << endl;
1253     cout << "se(pve) =" << pve_se_null << endl;
1254   } else {
1255   }
1256   return;
1257 }
1258 
ReadGenotypes(gsl_matrix * UtX,gsl_matrix * K,const bool calc_K)1259 void PARAM::ReadGenotypes(gsl_matrix *UtX, gsl_matrix *K, const bool calc_K) {
1260   string file_str;
1261 
1262   if (!file_bfile.empty()) {
1263     file_str = file_bfile + ".bed";
1264     if (ReadFile_bed(file_str, indicator_idv, indicator_snp, UtX, K, calc_K) ==
1265         false) {
1266       error = true;
1267     }
1268   } else {
1269     if (ReadFile_geno(file_geno, indicator_idv, indicator_snp, UtX, K,
1270                       calc_K) == false) {
1271       error = true;
1272     }
1273   }
1274 
1275   return;
1276 }
1277 
ReadGenotypes(vector<vector<unsigned char>> & Xt,gsl_matrix * K,const bool calc_K)1278 void PARAM::ReadGenotypes(vector<vector<unsigned char>> &Xt, gsl_matrix *K,
1279                           const bool calc_K) {
1280   string file_str;
1281 
1282   if (!file_bfile.empty()) {
1283     file_str = file_bfile + ".bed";
1284     if (ReadFile_bed(file_str, indicator_idv, indicator_snp, Xt, K, calc_K,
1285                      ni_test, ns_test) == false) {
1286       error = true;
1287     }
1288   } else {
1289     if (ReadFile_geno(file_geno, indicator_idv, indicator_snp, Xt, K, calc_K,
1290                       ni_test, ns_test) == false) {
1291       error = true;
1292     }
1293   }
1294 
1295   return;
1296 }
1297 
CalcKin(gsl_matrix * matrix_kin)1298 void PARAM::CalcKin(gsl_matrix *matrix_kin) {
1299   string file_str;
1300 
1301   gsl_matrix_set_zero(matrix_kin);
1302 
1303   if (!file_bfile.empty()) {
1304     file_str = file_bfile + ".bed";
1305     // enforce_msg(loco.empty(), "FIXME: LOCO nyi");
1306     if (PlinkKin(file_str, indicator_snp, a_mode - 20, d_pace, matrix_kin) ==
1307         false) {
1308       error = true;
1309     }
1310   } else {
1311     file_str = file_geno;
1312     if (BimbamKin(file_str, setKSnps, indicator_snp, a_mode - 20, d_pace,
1313                   matrix_kin, ni_max == 0) == false) {
1314       error = true;
1315     }
1316   }
1317 
1318   return;
1319 }
1320 
1321 // From an existing n by nd A and K matrices, compute the d by d S
1322 // matrix (which is not necessary symmetric).
compAKtoS(const gsl_matrix * A,const gsl_matrix * K,const size_t n_cvt,gsl_matrix * S)1323 void compAKtoS(const gsl_matrix *A, const gsl_matrix *K, const size_t n_cvt,
1324                gsl_matrix *S) {
1325   size_t n_vc = S->size1, ni_test = A->size1;
1326   double di, dj, tr_AK, sum_A, sum_K, s_A, s_K, sum_AK, tr_A, tr_K, d;
1327 
1328   for (size_t i = 0; i < n_vc; i++) {
1329     for (size_t j = 0; j < n_vc; j++) {
1330       tr_AK = 0;
1331       sum_A = 0;
1332       sum_K = 0;
1333       sum_AK = 0;
1334       tr_A = 0;
1335       tr_K = 0;
1336       for (size_t l = 0; l < ni_test; l++) {
1337         s_A = 0;
1338         s_K = 0;
1339         for (size_t k = 0; k < ni_test; k++) {
1340           di = gsl_matrix_get(A, l, k + ni_test * i);
1341           dj = gsl_matrix_get(K, l, k + ni_test * j);
1342           s_A += di;
1343           s_K += dj;
1344 
1345           tr_AK += di * dj;
1346           sum_A += di;
1347           sum_K += dj;
1348           if (l == k) {
1349             tr_A += di;
1350             tr_K += dj;
1351           }
1352         }
1353         sum_AK += s_A * s_K;
1354       }
1355 
1356       sum_A /= (double)ni_test;
1357       sum_K /= (double)ni_test;
1358       sum_AK /= (double)ni_test;
1359       tr_A -= sum_A;
1360       tr_K -= sum_K;
1361       d = tr_AK - 2 * sum_AK + sum_A * sum_K;
1362 
1363       if (tr_A == 0 || tr_K == 0) {
1364         d = 0;
1365       } else {
1366         assert((tr_A * tr_K) - 1 != 0);
1367         assert(ni_test - n_cvt != 0);
1368         d = d / (tr_A * tr_K) - 1 / (double)(ni_test - n_cvt);
1369       }
1370 
1371       gsl_matrix_set(S, i, j, d);
1372     }
1373   }
1374 
1375   return;
1376 }
1377 
1378 /*
1379 // Copied from lmm.cpp; is used in the following function compKtoV
1380 // map a number 1..(n_cvt+2) to an index between 0 and [(n_cvt+2)*2+(n_cvt+2)]/2-1
1381 // or 1..cols to 0..(cols*2+cols)/2-1.
1382 
1383   For a 3x3 matrix the following index gets returned to CalcPab:
1384 
1385   CalcPab
1386   * 1,1:0
1387   * 1,2:1
1388   * 1,3:2
1389   * 2,2:5
1390   * 2,3:6
1391   * 3,3:9
1392 
1393 which is really the iteration moving forward along the diagonal and
1394 items to the right of it.
1395   */
1396 
1397 
GetabIndex(const size_t a,const size_t b,const size_t n_cvt)1398 size_t GetabIndex(const size_t a, const size_t b, const size_t n_cvt) {
1399   auto cols = n_cvt + 2;
1400   enforce_msg(a<=cols && b<=cols,"GetabIndex problem");
1401   size_t a1 = a, b1 = b;
1402   if (b <= a) {
1403     a1 = b;
1404     b1 = a;
1405   }
1406 
1407   size_t index = (2 * cols - a1 + 2) * (a1 - 1) / 2 + b1 - a1;
1408   // cout << "* GetabIndx " << a1 << "," << b1 << "," << cols << ":" << index << endl;
1409   // FIXME: should add a contract for index range
1410   return index;
1411   // return ( b < a ?  ((2 * cols - b + 2) * (b - 1) / 2 + a - b ): ((2 * cols - a + 2) * (a - 1) / 2 + b - a) );
1412 
1413 }
1414 
1415 // From an existing n by nd (centered) G matrix, compute the d+1 by
1416 // d*(d-1)/2*(d+1) Q matrix where inside i'th d+1 by d+1 matrix, each
1417 // element is tr(KiKlKjKm)-r*tr(KmKiKl)-r*tr(KlKjKm)+r^2*tr(KlKm),
1418 // where r=n/(n-1)
compKtoV(const gsl_matrix * G,gsl_matrix * V)1419 void compKtoV(const gsl_matrix *G, gsl_matrix *V) {
1420   size_t n_vc = G->size2 / G->size1, ni_test = G->size1;
1421 
1422   gsl_matrix *KiKj =
1423       gsl_matrix_alloc(ni_test, (n_vc * (n_vc + 1)) / 2 * ni_test);
1424   gsl_vector *trKiKj = gsl_vector_alloc(n_vc * (n_vc + 1) / 2);
1425   gsl_vector *trKi = gsl_vector_alloc(n_vc);
1426 
1427   assert(ni_test != 1);
1428   double d, tr, r = (double)ni_test / (double)(ni_test - 1);
1429   size_t t, t_il, t_jm, t_lm, t_im, t_jl, t_ij;
1430 
1431   // Compute KiKj for all pairs of i and j (not including the identity
1432   // matrix).
1433   t = 0;
1434   for (size_t i = 0; i < n_vc; i++) {
1435     gsl_matrix_const_view Ki =
1436         gsl_matrix_const_submatrix(G, 0, i * ni_test, ni_test, ni_test);
1437     for (size_t j = i; j < n_vc; j++) {
1438       gsl_matrix_const_view Kj =
1439           gsl_matrix_const_submatrix(G, 0, j * ni_test, ni_test, ni_test);
1440       gsl_matrix_view KiKj_sub =
1441           gsl_matrix_submatrix(KiKj, 0, t * ni_test, ni_test, ni_test);
1442       fast_dgemm("N", "N", 1.0, &Ki.matrix, &Kj.matrix, 0.0,
1443                      &KiKj_sub.matrix);
1444       t++;
1445     }
1446   }
1447 
1448   // Compute trKi, trKiKj.
1449   t = 0;
1450   for (size_t i = 0; i < n_vc; i++) {
1451     for (size_t j = i; j < n_vc; j++) {
1452       tr = 0;
1453       for (size_t k = 0; k < ni_test; k++) {
1454         tr += gsl_matrix_get(KiKj, k, t * ni_test + k);
1455       }
1456       gsl_vector_set(trKiKj, t, tr);
1457 
1458       t++;
1459     }
1460 
1461     tr = 0;
1462     for (size_t k = 0; k < ni_test; k++) {
1463       tr += gsl_matrix_get(G, k, i * ni_test + k);
1464     }
1465     gsl_vector_set(trKi, i, tr);
1466   }
1467 
1468   // Compute V.
1469   for (size_t i = 0; i < n_vc; i++) {
1470     for (size_t j = i; j < n_vc; j++) {
1471       t_ij = GetabIndex(i + 1, j + 1, n_vc - 2);
1472       for (size_t l = 0; l < n_vc + 1; l++) {
1473         for (size_t m = 0; m < n_vc + 1; m++) {
1474           if (l != n_vc && m != n_vc) {
1475             t_il = GetabIndex(i + 1, l + 1, n_vc - 2);
1476             t_jm = GetabIndex(j + 1, m + 1, n_vc - 2);
1477             t_lm = GetabIndex(l + 1, m + 1, n_vc - 2);
1478             tr = 0;
1479             for (size_t k = 0; k < ni_test; k++) {
1480               gsl_vector_const_view KiKl_row =
1481                   gsl_matrix_const_subrow(KiKj, k, t_il * ni_test, ni_test);
1482               gsl_vector_const_view KiKl_col =
1483                   gsl_matrix_const_column(KiKj, t_il * ni_test + k);
1484               gsl_vector_const_view KjKm_row =
1485                   gsl_matrix_const_subrow(KiKj, k, t_jm * ni_test, ni_test);
1486               gsl_vector_const_view KjKm_col =
1487                   gsl_matrix_const_column(KiKj, t_jm * ni_test + k);
1488 
1489               gsl_vector_const_view Kl_row =
1490                   gsl_matrix_const_subrow(G, k, l * ni_test, ni_test);
1491               gsl_vector_const_view Km_row =
1492                   gsl_matrix_const_subrow(G, k, m * ni_test, ni_test);
1493 
1494               if (i <= l && j <= m) {
1495                 gsl_blas_ddot(&KiKl_row.vector, &KjKm_col.vector, &d);
1496                 tr += d;
1497                 gsl_blas_ddot(&Km_row.vector, &KiKl_col.vector, &d);
1498                 tr -= r * d;
1499                 gsl_blas_ddot(&Kl_row.vector, &KjKm_col.vector, &d);
1500                 tr -= r * d;
1501               } else if (i <= l && j > m) {
1502                 gsl_blas_ddot(&KiKl_row.vector, &KjKm_row.vector, &d);
1503                 tr += d;
1504                 gsl_blas_ddot(&Km_row.vector, &KiKl_col.vector, &d);
1505                 tr -= r * d;
1506                 gsl_blas_ddot(&Kl_row.vector, &KjKm_row.vector, &d);
1507                 tr -= r * d;
1508               } else if (i > l && j <= m) {
1509                 gsl_blas_ddot(&KiKl_col.vector, &KjKm_col.vector, &d);
1510                 tr += d;
1511                 gsl_blas_ddot(&Km_row.vector, &KiKl_row.vector, &d);
1512                 tr -= r * d;
1513                 gsl_blas_ddot(&Kl_row.vector, &KjKm_col.vector, &d);
1514                 tr -= r * d;
1515               } else {
1516                 gsl_blas_ddot(&KiKl_col.vector, &KjKm_row.vector, &d);
1517                 tr += d;
1518                 gsl_blas_ddot(&Km_row.vector, &KiKl_row.vector, &d);
1519                 tr -= r * d;
1520                 gsl_blas_ddot(&Kl_row.vector, &KjKm_row.vector, &d);
1521                 tr -= r * d;
1522               }
1523             }
1524 
1525             tr += r * r * gsl_vector_get(trKiKj, t_lm);
1526           } else if (l != n_vc && m == n_vc) {
1527             t_il = GetabIndex(i + 1, l + 1, n_vc - 2);
1528             t_jl = GetabIndex(j + 1, l + 1, n_vc - 2);
1529             tr = 0;
1530             for (size_t k = 0; k < ni_test; k++) {
1531               gsl_vector_const_view KiKl_row =
1532                   gsl_matrix_const_subrow(KiKj, k, t_il * ni_test, ni_test);
1533               gsl_vector_const_view KiKl_col =
1534                   gsl_matrix_const_column(KiKj, t_il * ni_test + k);
1535               gsl_vector_const_view Kj_row =
1536                   gsl_matrix_const_subrow(G, k, j * ni_test, ni_test);
1537 
1538               if (i <= l) {
1539                 gsl_blas_ddot(&KiKl_row.vector, &Kj_row.vector, &d);
1540                 tr += d;
1541               } else {
1542                 gsl_blas_ddot(&KiKl_col.vector, &Kj_row.vector, &d);
1543                 tr += d;
1544               }
1545             }
1546             tr += -r * gsl_vector_get(trKiKj, t_il) -
1547                   r * gsl_vector_get(trKiKj, t_jl) +
1548                   r * r * gsl_vector_get(trKi, l);
1549           } else if (l == n_vc && m != n_vc) {
1550             t_jm = GetabIndex(j + 1, m + 1, n_vc - 2);
1551             t_im = GetabIndex(i + 1, m + 1, n_vc - 2);
1552             tr = 0;
1553             for (size_t k = 0; k < ni_test; k++) {
1554               gsl_vector_const_view KjKm_row =
1555                   gsl_matrix_const_subrow(KiKj, k, t_jm * ni_test, ni_test);
1556               gsl_vector_const_view KjKm_col =
1557                   gsl_matrix_const_column(KiKj, t_jm * ni_test + k);
1558               gsl_vector_const_view Ki_row =
1559                   gsl_matrix_const_subrow(G, k, i * ni_test, ni_test);
1560 
1561               if (j <= m) {
1562                 gsl_blas_ddot(&KjKm_row.vector, &Ki_row.vector, &d);
1563                 tr += d;
1564               } else {
1565                 gsl_blas_ddot(&KjKm_col.vector, &Ki_row.vector, &d);
1566                 tr += d;
1567               }
1568             }
1569             tr += -r * gsl_vector_get(trKiKj, t_im) -
1570                   r * gsl_vector_get(trKiKj, t_jm) +
1571                   r * r * gsl_vector_get(trKi, m);
1572           } else {
1573             tr = gsl_vector_get(trKiKj, t_ij) - r * gsl_vector_get(trKi, i) -
1574                  r * gsl_vector_get(trKi, j) + r * r * (double)(ni_test - 1);
1575           }
1576 
1577           gsl_matrix_set(V, l, t_ij * (n_vc + 1) + m, tr);
1578         }
1579       }
1580     }
1581   }
1582 
1583   assert(ni_test != 0);
1584   gsl_matrix_scale(V, 1.0 / pow((double)ni_test, 2));
1585 
1586   gsl_matrix_free(KiKj);
1587   gsl_vector_free(trKiKj);
1588   gsl_vector_free(trKi);
1589 
1590   return;
1591 }
1592 
1593 // Perform Jacknife sampling for variance of S.
JackknifeAKtoS(const gsl_matrix * W,const gsl_matrix * A,const gsl_matrix * K,gsl_matrix * S,gsl_matrix * Svar)1594 void JackknifeAKtoS(const gsl_matrix *W, const gsl_matrix *A,
1595                     const gsl_matrix *K, gsl_matrix *S, gsl_matrix *Svar) {
1596   size_t n_vc = Svar->size1, ni_test = A->size1, n_cvt = W->size2;
1597 
1598   vector<vector<vector<double>>> trAK, sumAK;
1599   vector<vector<double>> sumA, sumK, trA, trK, sA, sK;
1600   vector<double> vec_tmp;
1601   double di, dj, d, m, v;
1602 
1603   // Initialize and set all elements to zero.
1604   for (size_t i = 0; i < ni_test; i++) {
1605     vec_tmp.push_back(0);
1606   }
1607 
1608   for (size_t i = 0; i < n_vc; i++) {
1609     sumA.push_back(vec_tmp);
1610     sumK.push_back(vec_tmp);
1611     trA.push_back(vec_tmp);
1612     trK.push_back(vec_tmp);
1613     sA.push_back(vec_tmp);
1614     sK.push_back(vec_tmp);
1615   }
1616 
1617   for (size_t i = 0; i < n_vc; i++) {
1618     trAK.push_back(sumK);
1619     sumAK.push_back(sumK);
1620   }
1621 
1622   // Run jackknife.
1623   for (size_t i = 0; i < n_vc; i++) {
1624     for (size_t l = 0; l < ni_test; l++) {
1625       for (size_t k = 0; k < ni_test; k++) {
1626         di = gsl_matrix_get(A, l, k + ni_test * i);
1627         dj = gsl_matrix_get(K, l, k + ni_test * i);
1628 
1629         for (size_t t = 0; t < ni_test; t++) {
1630           if (t == l || t == k) {
1631             continue;
1632           }
1633           sumA[i][t] += di;
1634           sumK[i][t] += dj;
1635           if (l == k) {
1636             trA[i][t] += di;
1637             trK[i][t] += dj;
1638           }
1639         }
1640         sA[i][l] += di;
1641         sK[i][l] += dj;
1642       }
1643     }
1644 
1645     for (size_t t = 0; t < ni_test; t++) {
1646       assert(ni_test != 1);
1647       sumA[i][t] /= (double)(ni_test - 1);
1648       sumK[i][t] /= (double)(ni_test - 1);
1649     }
1650   }
1651 
1652   for (size_t i = 0; i < n_vc; i++) {
1653     for (size_t j = 0; j < n_vc; j++) {
1654       for (size_t l = 0; l < ni_test; l++) {
1655         for (size_t k = 0; k < ni_test; k++) {
1656           di = gsl_matrix_get(A, l, k + ni_test * i);
1657           dj = gsl_matrix_get(K, l, k + ni_test * j);
1658           d = di * dj;
1659 
1660           for (size_t t = 0; t < ni_test; t++) {
1661             if (t == l || t == k) {
1662               continue;
1663             }
1664             trAK[i][j][t] += d;
1665           }
1666         }
1667 
1668         for (size_t t = 0; t < ni_test; t++) {
1669           if (t == l) {
1670             continue;
1671           }
1672           di = gsl_matrix_get(A, l, t + ni_test * i);
1673           dj = gsl_matrix_get(K, l, t + ni_test * j);
1674 
1675           sumAK[i][j][t] += (sA[i][l] - di) * (sK[j][l] - dj);
1676         }
1677       }
1678 
1679       for (size_t t = 0; t < ni_test; t++) {
1680         assert(ni_test != 1);
1681         sumAK[i][j][t] /= (double)(ni_test - 1);
1682       }
1683 
1684       m = 0;
1685       v = 0;
1686       for (size_t t = 0; t < ni_test; t++) {
1687         d = trAK[i][j][t] - 2 * sumAK[i][j][t] + sumA[i][t] * sumK[j][t];
1688         if ((trA[i][t] - sumA[i][t]) == 0 || (trK[j][t] - sumK[j][t]) == 0) {
1689           d = 0;
1690         } else {
1691           d /= (trA[i][t] - sumA[i][t]) * (trK[j][t] - sumK[j][t]);
1692           d -= 1 / (double)(ni_test - n_cvt - 1);
1693         }
1694         m += d;
1695         v += d * d;
1696       }
1697       m /= (double)ni_test;
1698       v /= (double)ni_test;
1699       v -= m * m;
1700       v *= (double)(ni_test - 1);
1701       gsl_matrix_set(Svar, i, j, v);
1702       if (n_cvt == 1) {
1703         d = gsl_matrix_get(S, i, j);
1704         d = (double)ni_test * d - (double)(ni_test - 1) * m;
1705         gsl_matrix_set(S, i, j, d);
1706       }
1707     }
1708   }
1709 
1710   return;
1711 }
1712 
1713 // Compute the d by d S matrix with its d by d variance matrix of
1714 // Svar, and the d+1 by d(d+1) matrix of Q for V(q).
CalcS(const map<string,double> & mapRS2wA,const map<string,double> & mapRS2wK,const gsl_matrix * W,gsl_matrix * A,gsl_matrix * K,gsl_matrix * S,gsl_matrix * Svar,gsl_vector * ns)1715 void PARAM::CalcS(const map<string, double> &mapRS2wA,
1716                   const map<string, double> &mapRS2wK, const gsl_matrix *W,
1717                   gsl_matrix *A, gsl_matrix *K, gsl_matrix *S, gsl_matrix *Svar,
1718                   gsl_vector *ns) {
1719   string file_str;
1720 
1721   gsl_matrix_set_zero(S);
1722   gsl_matrix_set_zero(Svar);
1723   gsl_vector_set_zero(ns);
1724 
1725   // Compute the kinship matrix G for multiple categories; these
1726   // matrices are not centered, for convienence of Jacknife sampling.
1727   if (!file_bfile.empty()) {
1728     file_str = file_bfile + ".bed";
1729     if (mapRS2wA.size() == 0) {
1730       if (PlinkKin(file_str, d_pace, indicator_idv, indicator_snp, mapRS2wK,
1731                    mapRS2cat, snpInfo, W, K, ns) == false) {
1732         error = true;
1733       }
1734     } else {
1735       if (PlinkKin(file_str, d_pace, indicator_idv, indicator_snp, mapRS2wA,
1736                    mapRS2cat, snpInfo, W, A, ns) == false) {
1737         error = true;
1738       }
1739     }
1740   } else if (!file_geno.empty()) {
1741     file_str = file_geno;
1742     if (mapRS2wA.size() == 0) {
1743       if (BimbamKinUncentered(file_str, setKSnps, d_pace, indicator_idv,
1744                               indicator_snp, mapRS2wK, mapRS2cat, snpInfo, W, K,
1745                               ns) == false) {
1746         error = true;
1747       }
1748     } else {
1749       if (BimbamKinUncentered(file_str, setKSnps, d_pace, indicator_idv,
1750                               indicator_snp, mapRS2wA, mapRS2cat, snpInfo, W, A,
1751                               ns) == false) {
1752         error = true;
1753       }
1754     }
1755   } else if (!file_mbfile.empty()) {
1756     if (mapRS2wA.size() == 0) {
1757       if (MFILEKin(1, file_mbfile, setKSnps, d_pace, indicator_idv,
1758                    mindicator_snp, mapRS2wK, mapRS2cat, msnpInfo, W, K,
1759                    ns) == false) {
1760         error = true;
1761       }
1762     } else {
1763       if (MFILEKin(1, file_mbfile, setKSnps, d_pace, indicator_idv,
1764                    mindicator_snp, mapRS2wA, mapRS2cat, msnpInfo, W, A,
1765                    ns) == false) {
1766         error = true;
1767       }
1768     }
1769   } else if (!file_mgeno.empty()) {
1770     if (mapRS2wA.size() == 0) {
1771       if (MFILEKin(0, file_mgeno, setKSnps, d_pace, indicator_idv,
1772                    mindicator_snp, mapRS2wK, mapRS2cat, msnpInfo, W, K,
1773                    ns) == false) {
1774         error = true;
1775       }
1776     } else {
1777       if (MFILEKin(0, file_mgeno, setKSnps, d_pace, indicator_idv,
1778                    mindicator_snp, mapRS2wA, mapRS2cat, msnpInfo, W, A,
1779                    ns) == false) {
1780         error = true;
1781       }
1782     }
1783   }
1784 
1785   if (mapRS2wA.size() == 0) {
1786     gsl_matrix_memcpy(A, K);
1787   }
1788 
1789   // Center and scale every kinship matrix inside G.
1790   for (size_t i = 0; i < n_vc; i++) {
1791     gsl_matrix_view Ksub =
1792         gsl_matrix_submatrix(K, 0, i * ni_test, ni_test, ni_test);
1793     CenterMatrix(&Ksub.matrix);
1794     // Scale the matrix G such that the mean diagonal = 1.
1795     ScaleMatrix(&Ksub.matrix);
1796 
1797     gsl_matrix_view Asub =
1798         gsl_matrix_submatrix(A, 0, i * ni_test, ni_test, ni_test);
1799     CenterMatrix(&Asub.matrix);
1800     ScaleMatrix(&Asub.matrix);
1801   }
1802 
1803   // Cased on G, compute S.
1804   compAKtoS(A, K, W->size2, S);
1805 
1806   // Compute Svar and update S with Jacknife.
1807   JackknifeAKtoS(W, A, K, S, Svar);
1808 
1809   return;
1810 }
1811 
WriteVector(const gsl_vector * q,const gsl_vector * s,const size_t n_total,const string suffix)1812 void PARAM::WriteVector(const gsl_vector *q, const gsl_vector *s,
1813                         const size_t n_total, const string suffix) {
1814   string file_str;
1815   file_str = path_out + "/" + file_out;
1816   file_str += ".";
1817   file_str += suffix;
1818   file_str += ".txt";
1819 
1820   ofstream outfile(file_str.c_str(), ofstream::out);
1821   if (!outfile) {
1822     cout << "error writing file: " << file_str.c_str() << endl;
1823     return;
1824   }
1825 
1826   outfile.precision(10);
1827 
1828   for (size_t i = 0; i < q->size; ++i) {
1829     outfile << gsl_vector_get(q, i) << endl;
1830   }
1831 
1832   for (size_t i = 0; i < s->size; ++i) {
1833     outfile << gsl_vector_get(s, i) << endl;
1834   }
1835 
1836   outfile << n_total << endl;
1837 
1838   outfile.close();
1839   outfile.clear();
1840   return;
1841 }
1842 
WriteVar(const string suffix)1843 void PARAM::WriteVar(const string suffix) {
1844   string file_str, rs;
1845   file_str = path_out + "/" + file_out;
1846   file_str += ".";
1847   file_str += suffix;
1848   file_str += ".txt.gz";
1849 
1850   ogzstream outfile(file_str.c_str(), ogzstream::out);
1851   if (!outfile) {
1852     cout << "error writing file: " << file_str.c_str() << endl;
1853     return;
1854   }
1855 
1856   outfile.precision(10);
1857 
1858   if (mindicator_snp.size() != 0) {
1859     for (size_t t = 0; t < mindicator_snp.size(); t++) {
1860       indicator_snp = mindicator_snp[t];
1861       for (size_t i = 0; i < indicator_snp.size(); i++) {
1862         if (indicator_snp[i] == 0) {
1863           continue;
1864         }
1865         rs = snpInfo[i].rs_number;
1866         outfile << rs << endl;
1867       }
1868     }
1869   } else {
1870     for (size_t i = 0; i < indicator_snp.size(); i++) {
1871       if (indicator_snp[i] == 0) {
1872         continue;
1873       }
1874       rs = snpInfo[i].rs_number;
1875       outfile << rs << endl;
1876     }
1877   }
1878 
1879   outfile.close();
1880   outfile.clear();
1881   return;
1882 }
1883 
WriteMatrix(const gsl_matrix * matrix_U,const string suffix)1884 void PARAM::WriteMatrix(const gsl_matrix *matrix_U, const string suffix) {
1885   string file_str;
1886   file_str = path_out + "/" + file_out;
1887   file_str += ".";
1888   file_str += suffix;
1889   file_str += ".txt";
1890 
1891   ofstream outfile(file_str.c_str(), ofstream::out);
1892   if (!outfile) {
1893     cout << "error writing file: " << file_str.c_str() << endl;
1894     return;
1895   }
1896 
1897   outfile.precision(10);
1898 
1899   for (size_t i = 0; i < matrix_U->size1; ++i) {
1900     for (size_t j = 0; j < matrix_U->size2; ++j) {
1901       outfile << tab(j) << gsl_matrix_get(matrix_U, i, j);
1902     }
1903     outfile << endl;
1904   }
1905 
1906   outfile.close();
1907   outfile.clear();
1908   return;
1909 }
1910 
WriteVector(const gsl_vector * vector_D,const string suffix)1911 void PARAM::WriteVector(const gsl_vector *vector_D, const string suffix) {
1912   string file_str;
1913   file_str = path_out + "/" + file_out;
1914   file_str += ".";
1915   file_str += suffix;
1916   file_str += ".txt";
1917 
1918   ofstream outfile(file_str.c_str(), ofstream::out);
1919   if (!outfile) {
1920     cout << "error writing file: " << file_str.c_str() << endl;
1921     return;
1922   }
1923 
1924   outfile.precision(10);
1925 
1926   for (size_t i = 0; i < vector_D->size; ++i) {
1927     outfile << gsl_vector_get(vector_D, i) << endl;
1928   }
1929 
1930   outfile.close();
1931   outfile.clear();
1932   return;
1933 }
1934 
CheckCvt()1935 void PARAM::CheckCvt() {
1936   if (indicator_cvt.size() == 0) {
1937     return;
1938   }
1939 
1940   size_t ci_test = 0;
1941 
1942   gsl_matrix *W = gsl_matrix_alloc(ni_test, n_cvt);
1943 
1944   for (vector<int>::size_type i = 0; i < indicator_idv.size(); ++i) {
1945     if (indicator_idv[i] == 0 || indicator_cvt[i] == 0) {
1946       continue;
1947     }
1948     for (size_t j = 0; j < n_cvt; ++j) {
1949       gsl_matrix_set(W, ci_test, j, (cvt)[i][j]);
1950     }
1951     ci_test++;
1952   }
1953 
1954   size_t flag_ipt = 0;
1955   double v_min, v_max;
1956   set<size_t> set_remove;
1957 
1958   // Check if any columns is an intercept.
1959   for (size_t i = 0; i < W->size2; i++) {
1960     gsl_vector_view w_col = gsl_matrix_column(W, i);
1961     gsl_vector_minmax(&w_col.vector, &v_min, &v_max);
1962     if (v_min == v_max) {
1963       flag_ipt = 1;
1964       set_remove.insert(i);
1965     }
1966   }
1967 
1968   // Add an intercept term if needed.
1969   if (n_cvt == set_remove.size()) {
1970     indicator_cvt.clear();
1971     n_cvt = 1;
1972   } else if (flag_ipt == 0) {
1973     info_msg("no intercept term is found in the cvt file: a column of 1s is added");
1974     for (vector<int>::size_type i = 0; i < indicator_idv.size(); ++i) {
1975       if (indicator_idv[i] == 0 || indicator_cvt[i] == 0) {
1976         continue;
1977       }
1978       cvt[i].push_back(1.0);
1979     }
1980 
1981     n_cvt++;
1982   } else {
1983   }
1984 
1985   gsl_matrix_free(W);
1986 
1987   return;
1988 }
1989 
1990 // Post-process phenotypes and covariates.
ProcessCvtPhen()1991 void PARAM::ProcessCvtPhen() {
1992 
1993   // Convert indicator_pheno to indicator_idv.
1994   int k = 1;
1995   indicator_idv.clear();
1996   for (size_t i = 0; i < indicator_pheno.size(); i++) {
1997     k = 1;
1998     for (size_t j = 0; j < indicator_pheno[i].size(); j++) {
1999       if (indicator_pheno[i][j] == 0) {
2000         k = 0;
2001       }
2002     }
2003     indicator_idv.push_back(k);
2004   }
2005 
2006   // Remove individuals with missing covariates.
2007   if ((indicator_cvt).size() != 0) {
2008     for (vector<int>::size_type i = 0; i < (indicator_idv).size(); ++i) {
2009       indicator_idv[i] *= indicator_cvt[i];
2010     }
2011   }
2012 
2013   // Remove individuals with missing gxe variables.
2014   if ((indicator_gxe).size() != 0) {
2015     for (vector<int>::size_type i = 0; i < (indicator_idv).size(); ++i) {
2016       indicator_idv[i] *= indicator_gxe[i];
2017     }
2018   }
2019 
2020   // Remove individuals with missing residual weights.
2021   if ((indicator_weight).size() != 0) {
2022     for (vector<int>::size_type i = 0; i < (indicator_idv).size(); ++i) {
2023       indicator_idv[i] *= indicator_weight[i];
2024     }
2025   }
2026 
2027   // Obtain ni_test.
2028   ni_test = 0;
2029   for (vector<int>::size_type i = 0; i < (indicator_idv).size(); ++i) {
2030     if (indicator_idv[i] == 0) {
2031       continue;
2032     }
2033     ni_test++;
2034   }
2035 
2036   // If subsample number is set, perform a random sub-sampling
2037   // to determine the subsampled ids.
2038   if (ni_subsample != 0) {
2039     if (ni_test < ni_subsample) {
2040       cout << "error! number of subsamples is less than number of"
2041            << "analyzed individuals. " << endl;
2042     } else {
2043 
2044       // From ni_test, sub-sample ni_subsample.
2045       vector<size_t> a, b;
2046       for (size_t i = 0; i < ni_subsample; i++) {
2047         a.push_back(0);
2048       }
2049       for (size_t i = 0; i < ni_test; i++) {
2050         b.push_back(i);
2051       }
2052 
2053       gsl_ran_choose(gsl_r, static_cast<void *>(&a[0]), ni_subsample,
2054                      static_cast<void *>(&b[0]), ni_test, sizeof(size_t));
2055 
2056       // Re-set indicator_idv and ni_test.
2057       int j = 0;
2058       for (vector<int>::size_type i = 0; i < (indicator_idv).size(); ++i) {
2059         if (indicator_idv[i] == 0) {
2060           continue;
2061         }
2062         if (find(a.begin(), a.end(), j) == a.end()) {
2063           indicator_idv[i] = 0;
2064         }
2065         j++;
2066       }
2067       ni_test = ni_subsample;
2068     }
2069   }
2070 
2071   // Check ni_test.
2072   if (a_mode != M_BSLMM5)
2073     enforce_msg(ni_test,"number of analyzed individuals equals 0.");
2074   if (ni_test == 0 && a_mode != M_BSLMM5) {
2075     error = true;
2076     cout << "error! number of analyzed individuals equals 0. " << endl;
2077   }
2078 
2079   // Check covariates to see if they are correlated with each
2080   // other, and to see if the intercept term is included.
2081   // After getting ni_test.
2082   // Add or remove covariates.
2083   if (indicator_cvt.size() != 0) {
2084     CheckCvt();
2085   } else {
2086     vector<double> cvt_row;
2087     cvt_row.push_back(1);
2088 
2089     for (vector<int>::size_type i = 0; i < (indicator_idv).size(); ++i) {
2090       indicator_cvt.push_back(1);
2091       cvt.push_back(cvt_row);
2092     }
2093   }
2094 
2095   return;
2096 }
2097 
CopyCvt(gsl_matrix * W)2098 void PARAM::CopyCvt(gsl_matrix *W) {
2099   size_t ci_test = 0;
2100 
2101   for (vector<int>::size_type i = 0; i < indicator_idv.size(); ++i) {
2102     if (indicator_idv[i] == 0 || indicator_cvt[i] == 0) {
2103       continue;
2104     }
2105     for (size_t j = 0; j < n_cvt; ++j) {
2106       gsl_matrix_set(W, ci_test, j, (cvt)[i][j]);
2107     }
2108     ci_test++;
2109   }
2110 
2111   return;
2112 }
2113 
CopyGxe(gsl_vector * env)2114 void PARAM::CopyGxe(gsl_vector *env) {
2115   size_t ci_test = 0;
2116 
2117   for (vector<int>::size_type i = 0; i < indicator_idv.size(); ++i) {
2118     if (indicator_idv[i] == 0 || indicator_gxe[i] == 0) {
2119       continue;
2120     }
2121     gsl_vector_set(env, ci_test, gxe[i]);
2122     ci_test++;
2123   }
2124 
2125   return;
2126 }
2127 
CopyWeight(gsl_vector * w)2128 void PARAM::CopyWeight(gsl_vector *w) {
2129   size_t ci_test = 0;
2130 
2131   for (vector<int>::size_type i = 0; i < indicator_idv.size(); ++i) {
2132     if (indicator_idv[i] == 0 || indicator_weight[i] == 0) {
2133       continue;
2134     }
2135     gsl_vector_set(w, ci_test, weight[i]);
2136     ci_test++;
2137   }
2138 
2139   return;
2140 }
2141 
2142 // If flag=0, then use indicator_idv to load W and Y;
2143 // else, use indicator_cvt to load them.
CopyCvtPhen(gsl_matrix * W,gsl_vector * y,size_t flag)2144 void PARAM::CopyCvtPhen(gsl_matrix *W, gsl_vector *y, size_t flag) {
2145   size_t ci_test = 0;
2146 
2147   for (vector<int>::size_type i = 0; i < indicator_idv.size(); ++i) {
2148     if (flag == 0) {
2149       if (indicator_idv[i] == 0) {
2150         continue;
2151       }
2152     } else {
2153       if (indicator_cvt[i] == 0) {
2154         continue;
2155       }
2156     }
2157 
2158     gsl_vector_set(y, ci_test, (pheno)[i][0]);
2159 
2160     for (size_t j = 0; j < n_cvt; ++j) {
2161       gsl_matrix_set(W, ci_test, j, (cvt)[i][j]);
2162     }
2163     ci_test++;
2164   }
2165 
2166   return;
2167 }
2168 
2169 // If flag=0, then use indicator_idv to load W and Y;
2170 // else, use indicator_cvt to load them.
CopyCvtPhen(gsl_matrix * W,gsl_matrix * Y,size_t flag)2171 void PARAM::CopyCvtPhen(gsl_matrix *W, gsl_matrix *Y, size_t flag) {
2172   size_t ci_test = 0;
2173 
2174   for (vector<int>::size_type i = 0; i < indicator_idv.size(); ++i) {
2175     if (flag == 0) {
2176       if (indicator_idv[i] == 0) {
2177         continue;
2178       }
2179     } else {
2180       if (indicator_cvt[i] == 0) {
2181         continue;
2182       }
2183     }
2184 
2185     for (size_t j = 0; j < n_ph; ++j) {
2186       gsl_matrix_set(Y, ci_test, j, (pheno)[i][j]);
2187     }
2188     for (size_t j = 0; j < n_cvt; ++j) {
2189       gsl_matrix_set(W, ci_test, j, (cvt)[i][j]);
2190     }
2191 
2192     ci_test++;
2193   }
2194 
2195   return;
2196 }
2197 
CopyRead(gsl_vector * log_N)2198 void PARAM::CopyRead(gsl_vector *log_N) {
2199   size_t ci_test = 0;
2200 
2201   for (vector<int>::size_type i = 0; i < indicator_idv.size(); ++i) {
2202     if (indicator_idv[i] == 0) {
2203       continue;
2204     }
2205     gsl_vector_set(log_N, ci_test, log(vec_read[i]));
2206     ci_test++;
2207   }
2208 
2209   return;
2210 }
2211 
ObtainWeight(const set<string> & setSnps_beta,map<string,double> & mapRS2wK)2212 void PARAM::ObtainWeight(const set<string> &setSnps_beta,
2213                          map<string, double> &mapRS2wK) {
2214   mapRS2wK.clear();
2215 
2216   vector<double> wsum, wcount;
2217 
2218   for (size_t i = 0; i < n_vc; i++) {
2219     wsum.push_back(0.0);
2220     wcount.push_back(0.0);
2221   }
2222 
2223   string rs;
2224   if (msnpInfo.size() == 0) {
2225     for (size_t i = 0; i < snpInfo.size(); i++) {
2226       if (indicator_snp[i] == 0) {
2227         continue;
2228       }
2229 
2230       rs = snpInfo[i].rs_number;
2231       if ((setSnps_beta.size() == 0 || setSnps_beta.count(rs) != 0) &&
2232           (mapRS2wsnp.size() == 0 || mapRS2wsnp.count(rs) != 0) &&
2233           (mapRS2wcat.size() == 0 || mapRS2wcat.count(rs) != 0) &&
2234           (mapRS2cat.size() == 0 || mapRS2cat.count(rs) != 0)) {
2235         if (mapRS2wsnp.size() != 0) {
2236           mapRS2wK[rs] = mapRS2wsnp[rs];
2237           if (mapRS2cat.size() == 0) {
2238             wsum[0] += mapRS2wsnp[rs];
2239           } else {
2240             wsum[mapRS2cat[rs]] += mapRS2wsnp[rs];
2241           }
2242           wcount[0]++;
2243         } else {
2244           mapRS2wK[rs] = 1;
2245         }
2246       }
2247     }
2248   } else {
2249     for (size_t t = 0; t < msnpInfo.size(); t++) {
2250       snpInfo = msnpInfo[t];
2251       indicator_snp = mindicator_snp[t];
2252 
2253       for (size_t i = 0; i < snpInfo.size(); i++) {
2254         if (indicator_snp[i] == 0) {
2255           continue;
2256         }
2257 
2258         rs = snpInfo[i].rs_number;
2259         if ((setSnps_beta.size() == 0 || setSnps_beta.count(rs) != 0) &&
2260             (mapRS2wsnp.size() == 0 || mapRS2wsnp.count(rs) != 0) &&
2261             (mapRS2wcat.size() == 0 || mapRS2wcat.count(rs) != 0) &&
2262             (mapRS2cat.size() == 0 || mapRS2cat.count(rs) != 0)) {
2263           if (mapRS2wsnp.size() != 0) {
2264             mapRS2wK[rs] = mapRS2wsnp[rs];
2265             if (mapRS2cat.size() == 0) {
2266               wsum[0] += mapRS2wsnp[rs];
2267             } else {
2268               wsum[mapRS2cat[rs]] += mapRS2wsnp[rs];
2269             }
2270             wcount[0]++;
2271           } else {
2272             mapRS2wK[rs] = 1;
2273           }
2274         }
2275       }
2276     }
2277   }
2278 
2279   if (mapRS2wsnp.size() != 0) {
2280     for (size_t i = 0; i < n_vc; i++) {
2281       wsum[i] /= wcount[i];
2282     }
2283 
2284     for (map<string, double>::iterator it = mapRS2wK.begin();
2285          it != mapRS2wK.end(); ++it) {
2286       if (mapRS2cat.size() == 0) {
2287         it->second /= wsum[0];
2288       } else {
2289         it->second /= wsum[mapRS2cat[it->first]];
2290       }
2291     }
2292   }
2293   return;
2294 }
2295 
2296 // If pve_flag=0 then do not change pve; pve_flag==1, then change pve
2297 // to 0 if pve < 0 and pve to 1 if pve > 1.
UpdateWeight(const size_t pve_flag,const map<string,double> & mapRS2wK,const size_t ni_test,const gsl_vector * ns,map<string,double> & mapRS2wA)2298 void PARAM::UpdateWeight(const size_t pve_flag,
2299                          const map<string, double> &mapRS2wK,
2300                          const size_t ni_test, const gsl_vector *ns,
2301                          map<string, double> &mapRS2wA) {
2302   double d;
2303   vector<double> wsum, wcount;
2304 
2305   for (size_t i = 0; i < n_vc; i++) {
2306     wsum.push_back(0.0);
2307     wcount.push_back(0.0);
2308   }
2309 
2310   for (map<string, double>::const_iterator it = mapRS2wK.begin();
2311        it != mapRS2wK.end(); ++it) {
2312     d = 1;
2313     for (size_t i = 0; i < n_vc; i++) {
2314       if (v_pve[i] >= 1 && pve_flag == 1) {
2315         d += (double)ni_test / gsl_vector_get(ns, i) * mapRS2wcat[it->first][i];
2316       } else if (v_pve[i] <= 0 && pve_flag == 1) {
2317         d += 0;
2318       } else {
2319         d += (double)ni_test / gsl_vector_get(ns, i) *
2320              mapRS2wcat[it->first][i] * v_pve[i];
2321       }
2322     }
2323     mapRS2wA[it->first] = 1 / (d * d);
2324 
2325     if (mapRS2cat.size() == 0) {
2326       wsum[0] += mapRS2wA[it->first];
2327       wcount[0]++;
2328     } else {
2329       wsum[mapRS2cat[it->first]] += mapRS2wA[it->first];
2330       wcount[mapRS2cat[it->first]]++;
2331     }
2332   }
2333 
2334   for (size_t i = 0; i < n_vc; i++) {
2335     wsum[i] /= wcount[i];
2336   }
2337 
2338   for (map<string, double>::iterator it = mapRS2wA.begin();
2339        it != mapRS2wA.end(); ++it) {
2340     if (mapRS2cat.size() == 0) {
2341       it->second /= wsum[0];
2342     } else {
2343       it->second /= wsum[mapRS2cat[it->first]];
2344     }
2345   }
2346   return;
2347 }
2348 
2349 // This function updates indicator_snp, and save z-scores and other
2350 // values into vectors.
UpdateSNPnZ(const map<string,double> & mapRS2wA,const map<string,string> & mapRS2A1,const map<string,double> & mapRS2z,gsl_vector * w,gsl_vector * z,vector<size_t> & vec_cat)2351 void PARAM::UpdateSNPnZ(const map<string, double> &mapRS2wA,
2352                         const map<string, string> &mapRS2A1,
2353                         const map<string, double> &mapRS2z, gsl_vector *w,
2354                         gsl_vector *z, vector<size_t> &vec_cat) {
2355   gsl_vector_set_zero(w);
2356   gsl_vector_set_zero(z);
2357   vec_cat.clear();
2358 
2359   string rs, a1;
2360   size_t c = 0;
2361   if (msnpInfo.size() == 0) {
2362     for (size_t i = 0; i < snpInfo.size(); i++) {
2363       if (indicator_snp[i] == 0) {
2364         continue;
2365       }
2366 
2367       rs = snpInfo[i].rs_number;
2368       a1 = snpInfo[i].a_minor;
2369 
2370       if (mapRS2wA.count(rs) != 0) {
2371         if (a1 == mapRS2A1.at(rs)) {
2372           gsl_vector_set(z, c, mapRS2z.at(rs));
2373         } else {
2374           gsl_vector_set(z, c, -1 * mapRS2z.at(rs));
2375         }
2376         vec_cat.push_back(mapRS2cat.at(rs));
2377         gsl_vector_set(w, c, mapRS2wA.at(rs));
2378 
2379         c++;
2380       } else {
2381         indicator_snp[i] = 0;
2382       }
2383     }
2384   } else {
2385     for (size_t t = 0; t < msnpInfo.size(); t++) {
2386       snpInfo = msnpInfo[t];
2387 
2388       for (size_t i = 0; i < snpInfo.size(); i++) {
2389         if (mindicator_snp[t][i] == 0) {
2390           continue;
2391         }
2392 
2393         rs = snpInfo[i].rs_number;
2394         a1 = snpInfo[i].a_minor;
2395 
2396         if (mapRS2wA.count(rs) != 0) {
2397           if (a1 == mapRS2A1.at(rs)) {
2398             gsl_vector_set(z, c, mapRS2z.at(rs));
2399           } else {
2400             gsl_vector_set(z, c, -1 * mapRS2z.at(rs));
2401           }
2402           vec_cat.push_back(mapRS2cat.at(rs));
2403           gsl_vector_set(w, c, mapRS2wA.at(rs));
2404 
2405           c++;
2406         } else {
2407           mindicator_snp[t][i] = 0;
2408         }
2409       }
2410     }
2411   }
2412 
2413   return;
2414 }
2415 
2416 // This function updates indicator_snp, and save z-scores and other
2417 // values into vectors.
UpdateSNP(const map<string,double> & mapRS2wA)2418 void PARAM::UpdateSNP(const map<string, double> &mapRS2wA) {
2419   string rs;
2420   if (msnpInfo.size() == 0) {
2421     for (size_t i = 0; i < snpInfo.size(); i++) {
2422       if (indicator_snp[i] == 0) {
2423         continue;
2424       }
2425 
2426       rs = snpInfo[i].rs_number;
2427 
2428       if (mapRS2wA.count(rs) == 0) {
2429         indicator_snp[i] = 0;
2430       }
2431     }
2432   } else {
2433     for (size_t t = 0; t < msnpInfo.size(); t++) {
2434       snpInfo = msnpInfo[t];
2435 
2436       for (size_t i = 0; i < mindicator_snp[t].size(); i++) {
2437         if (mindicator_snp[t][i] == 0) {
2438           continue;
2439         }
2440 
2441         rs = snpInfo[i].rs_number;
2442 
2443         if (mapRS2wA.count(rs) == 0) {
2444           mindicator_snp[t][i] = 0;
2445         }
2446       }
2447     }
2448   }
2449 
2450   return;
2451 }
2452