1 /*
2  Genome-wide Efficient Mixed Model Association (GEMMA)
3  Copyright (C) 2011-2017, Xiang Zhou
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 */
18 
19 #include <fstream>
20 #include <iostream>
21 #include <sstream>
22 
23 #include <algorithm>
24 #include <cmath>
25 #include <cstring>
26 #include <ctime>
27 #include <iomanip>
28 #include <iostream>
29 #include <stdio.h>
30 #include <stdlib.h>
31 
32 #include "gsl/gsl_blas.h"
33 #include "gsl/gsl_cdf.h"
34 #include "gsl/gsl_eigen.h"
35 #include "gsl/gsl_linalg.h"
36 #include "gsl/gsl_matrix.h"
37 #include "gsl/gsl_randist.h"
38 #include "gsl/gsl_roots.h"
39 #include "gsl/gsl_vector.h"
40 
41 #include "bslmmdap.h"
42 #include "gemma_io.h"
43 #include "lapack.h"
44 #include "lm.h"
45 #include "lmm.h"
46 #include "logistic.h"
47 #include "mathfunc.h"
48 #include "param.h"
49 
50 using namespace std;
51 
CopyFromParam(PARAM & cPar)52 void BSLMMDAP::CopyFromParam(PARAM &cPar) {
53   file_out = cPar.file_out;
54   path_out = cPar.path_out;
55 
56   time_UtZ = 0.0;
57   time_Omega = 0.0;
58 
59   h_min = cPar.h_min;
60   h_max = cPar.h_max;
61   h_ngrid = cPar.h_ngrid;
62   rho_min = cPar.rho_min;
63   rho_max = cPar.rho_max;
64   rho_ngrid = cPar.rho_ngrid;
65 
66   if (h_min <= 0) {
67     h_min = 0.01;
68   }
69   if (h_max >= 1) {
70     h_max = 0.99;
71   }
72   if (rho_min <= 0) {
73     rho_min = 0.01;
74   }
75   if (rho_max >= 1) {
76     rho_max = 0.99;
77   }
78 
79   trace_G = cPar.trace_G;
80 
81   ni_total = cPar.ni_total;
82   ns_total = cPar.ns_total;
83   ni_test = cPar.ni_test;
84   ns_test = cPar.ns_test;
85 
86   indicator_idv = cPar.indicator_idv;
87   indicator_snp = cPar.indicator_snp;
88   snpInfo = cPar.snpInfo;
89 
90   return;
91 }
92 
CopyToParam(PARAM & cPar)93 void BSLMMDAP::CopyToParam(PARAM &cPar) {
94   cPar.time_UtZ = time_UtZ;
95   cPar.time_Omega = time_Omega;
96 
97   return;
98 }
99 
100 // Read hyp file.
ReadFile_hyb(const string & file_hyp,vector<double> & vec_sa2,vector<double> & vec_sb2,vector<double> & vec_wab)101 void ReadFile_hyb(const string &file_hyp, vector<double> &vec_sa2,
102                   vector<double> &vec_sb2, vector<double> &vec_wab) {
103   vec_sa2.clear();
104   vec_sb2.clear();
105   vec_wab.clear();
106 
107   igzstream infile(file_hyp.c_str(), igzstream::in);
108   if (!infile) {
109     cout << "error! fail to open hyp file: " << file_hyp << endl;
110     return;
111   }
112 
113   string line;
114   char *ch_ptr;
115 
116   getline(infile, line);
117 
118   while (!safeGetline(infile, line).eof()) {
119     ch_ptr = strtok_safe((char *)line.c_str(), " , \t");
120     ch_ptr = strtok_safe(NULL, " , \t");
121 
122     ch_ptr = strtok_safe(NULL, " , \t");
123     vec_sa2.push_back(atof(ch_ptr));
124 
125     ch_ptr = strtok_safe(NULL, " , \t");
126     vec_sb2.push_back(atof(ch_ptr));
127 
128     ch_ptr = strtok_safe(NULL, " , \t");
129     vec_wab.push_back(atof(ch_ptr));
130   }
131 
132   infile.close();
133   infile.clear();
134 
135   return;
136 }
137 
138 // Read bf file.
ReadFile_bf(const string & file_bf,vector<string> & vec_rs,vector<vector<vector<double>>> & BF)139 void ReadFile_bf(const string &file_bf, vector<string> &vec_rs,
140                  vector<vector<vector<double>>> &BF) {
141   BF.clear();
142   vec_rs.clear();
143 
144   igzstream infile(file_bf.c_str(), igzstream::in);
145   if (!infile) {
146     cout << "error! fail to open bf file: " << file_bf << endl;
147     return;
148   }
149 
150   string line, rs, block;
151   vector<double> vec_bf;
152   vector<vector<double>> mat_bf;
153   char *ch_ptr;
154 
155   size_t bf_size = 0, flag_block;
156 
157   getline(infile, line);
158 
159   size_t t = 0;
160   while (!safeGetline(infile, line).eof()) {
161     flag_block = 0;
162 
163     ch_ptr = strtok_safe((char *)line.c_str(), " , \t");
164     rs = ch_ptr;
165     vec_rs.push_back(rs);
166 
167     ch_ptr = strtok_safe(NULL, " , \t");
168     if (t == 0) {
169       block = ch_ptr;
170     } else {
171       if (strcmp(ch_ptr, block.c_str()) != 0) {
172         flag_block = 1;
173         block = ch_ptr;
174       }
175     }
176 
177     ch_ptr = strtok(NULL, " , \t");
178     while (ch_ptr != NULL) {
179       vec_bf.push_back(atof(ch_ptr));
180       ch_ptr = strtok(NULL, " , \t");
181     }
182 
183     if (t == 0) {
184       bf_size = vec_bf.size();
185     } else {
186       if (bf_size != vec_bf.size()) {
187         cout << "error! unequal row size in bf file." << endl;
188       }
189     }
190 
191     if (flag_block == 0) {
192       mat_bf.push_back(vec_bf);
193     } else {
194       BF.push_back(mat_bf);
195       mat_bf.clear();
196     }
197     vec_bf.clear();
198 
199     t++;
200   }
201 
202   infile.close();
203   infile.clear();
204 
205   return;
206 }
207 
208 // Read category files.
209 // Read both continuous and discrete category file, record mapRS2catc.
ReadFile_cat(const string & file_cat,const vector<string> & vec_rs,gsl_matrix * Ac,gsl_matrix_int * Ad,gsl_vector_int * dlevel,size_t & kc,size_t & kd)210 void ReadFile_cat(const string &file_cat, const vector<string> &vec_rs,
211                   gsl_matrix *Ac, gsl_matrix_int *Ad, gsl_vector_int *dlevel,
212                   size_t &kc, size_t &kd) {
213   igzstream infile(file_cat.c_str(), igzstream::in);
214   if (!infile) {
215     cout << "error! fail to open category file: " << file_cat << endl;
216     return;
217   }
218 
219   string line;
220   char *ch_ptr;
221 
222   string rs, chr, a1, a0, pos, cm;
223 
224   // Read header.
225   HEADER header;
226   safeGetline(infile, line).eof();
227   ReadHeader_io(line, header);
228 
229   // Use the header to determine the number of categories.
230   kc = header.catc_col.size();
231   kd = header.catd_col.size();
232 
233   // set up storage and mapper
234   map<string, vector<double>> mapRS2catc;
235   map<string, vector<int>> mapRS2catd;
236   vector<double> catc;
237   vector<int> catd;
238 
239   // Read the following lines to record mapRS2cat.
240   while (!safeGetline(infile, line).eof()) {
241     ch_ptr = strtok_safe((char *)line.c_str(), " , \t");
242 
243     if (header.rs_col == 0) {
244       rs = chr + ":" + pos;
245     }
246 
247     catc.clear();
248     catd.clear();
249 
250     for (size_t i = 0; i < header.coln; i++) {
251       enforce(ch_ptr);
252       if (header.rs_col != 0 && header.rs_col == i + 1) {
253         rs = ch_ptr;
254       } else if (header.chr_col != 0 && header.chr_col == i + 1) {
255         chr = ch_ptr;
256       } else if (header.pos_col != 0 && header.pos_col == i + 1) {
257         pos = ch_ptr;
258       } else if (header.cm_col != 0 && header.cm_col == i + 1) {
259         cm = ch_ptr;
260       } else if (header.a1_col != 0 && header.a1_col == i + 1) {
261         a1 = ch_ptr;
262       } else if (header.a0_col != 0 && header.a0_col == i + 1) {
263         a0 = ch_ptr;
264       } else if (header.catc_col.size() != 0 &&
265                  header.catc_col.count(i + 1) != 0) {
266         catc.push_back(atof(ch_ptr));
267       } else if (header.catd_col.size() != 0 &&
268                  header.catd_col.count(i + 1) != 0) {
269         catd.push_back(atoi(ch_ptr));
270       } else {
271       }
272 
273       ch_ptr = strtok(NULL, " , \t");
274     }
275 
276     if (mapRS2catc.count(rs) == 0 && kc > 0) {
277       mapRS2catc[rs] = catc;
278     }
279     if (mapRS2catd.count(rs) == 0 && kd > 0) {
280       mapRS2catd[rs] = catd;
281     }
282   }
283 
284   // Load into Ad and Ac.
285   if (kc > 0) {
286     Ac = gsl_matrix_alloc(vec_rs.size(), kc);
287     for (size_t i = 0; i < vec_rs.size(); i++) {
288       if (mapRS2catc.count(vec_rs[i]) != 0) {
289         for (size_t j = 0; j < kc; j++) {
290           gsl_matrix_set(Ac, i, j, mapRS2catc[vec_rs[i]][j]);
291         }
292       } else {
293         for (size_t j = 0; j < kc; j++) {
294           gsl_matrix_set(Ac, i, j, 0);
295         }
296       }
297     }
298   }
299 
300   if (kd > 0) {
301     Ad = gsl_matrix_int_alloc(vec_rs.size(), kd);
302 
303     for (size_t i = 0; i < vec_rs.size(); i++) {
304       if (mapRS2catd.count(vec_rs[i]) != 0) {
305         for (size_t j = 0; j < kd; j++) {
306           gsl_matrix_int_set(Ad, i, j, mapRS2catd[vec_rs[i]][j]);
307         }
308       } else {
309         for (size_t j = 0; j < kd; j++) {
310           gsl_matrix_int_set(Ad, i, j, 0);
311         }
312       }
313     }
314 
315     dlevel = gsl_vector_int_alloc(kd);
316     map<int, int> rcd;
317     int val;
318     for (size_t j = 0; j < kd; j++) {
319       rcd.clear();
320       for (size_t i = 0; i < Ad->size1; i++) {
321         val = gsl_matrix_int_get(Ad, i, j);
322         rcd[val] = 1;
323       }
324       gsl_vector_int_set(dlevel, j, rcd.size());
325     }
326   }
327 
328   infile.clear();
329   infile.close();
330 
331   return;
332 }
333 
WriteResult(const gsl_matrix * Hyper,const gsl_matrix * BF)334 void BSLMMDAP::WriteResult(const gsl_matrix *Hyper, const gsl_matrix *BF) {
335   string file_bf, file_hyp;
336   file_bf = path_out + "/" + file_out;
337   file_bf += ".bf.txt";
338   file_hyp = path_out + "/" + file_out;
339   file_hyp += ".hyp.txt";
340 
341   ofstream outfile_bf, outfile_hyp;
342 
343   outfile_bf.open(file_bf.c_str(), ofstream::out);
344   outfile_hyp.open(file_hyp.c_str(), ofstream::out);
345 
346   if (!outfile_bf) {
347     cout << "error writing file: " << file_bf << endl;
348     return;
349   }
350   if (!outfile_hyp) {
351     cout << "error writing file: " << file_hyp << endl;
352     return;
353   }
354 
355   outfile_hyp << "h"
356               << "\t"
357               << "rho"
358               << "\t"
359               << "sa2"
360               << "\t"
361               << "sb2"
362               << "\t"
363               << "weight" << endl;
364   outfile_hyp << scientific;
365   for (size_t i = 0; i < Hyper->size1; i++) {
366     for (size_t j = 0; j < Hyper->size2; j++) {
367       outfile_hyp << setprecision(6) << gsl_matrix_get(Hyper, i, j) << "\t";
368     }
369     outfile_hyp << endl;
370   }
371 
372   outfile_bf << "chr"
373              << "\t"
374              << "rs"
375              << "\t"
376              << "ps"
377              << "\t"
378              << "n_miss";
379   for (size_t i = 0; i < BF->size2; i++) {
380     outfile_bf << "\t"
381                << "BF" << i + 1;
382   }
383   outfile_bf << endl;
384 
385   size_t t = 0;
386   for (size_t i = 0; i < ns_total; ++i) {
387     if (indicator_snp[i] == 0) {
388       continue;
389     }
390 
391     outfile_bf << snpInfo[i].chr << "\t" << snpInfo[i].rs_number << "\t"
392                << snpInfo[i].base_position << "\t" << snpInfo[i].n_miss;
393 
394     outfile_bf << scientific;
395     for (size_t j = 0; j < BF->size2; j++) {
396       outfile_bf << "\t" << setprecision(6) << gsl_matrix_get(BF, t, j);
397     }
398     outfile_bf << endl;
399 
400     t++;
401   }
402 
403   outfile_hyp.close();
404   outfile_hyp.clear();
405   outfile_bf.close();
406   outfile_bf.clear();
407   return;
408 }
409 
WriteResult(const vector<string> & vec_rs,const gsl_matrix * Hyper,const gsl_vector * pip,const gsl_vector * coef)410 void BSLMMDAP::WriteResult(const vector<string> &vec_rs,
411                            const gsl_matrix *Hyper, const gsl_vector *pip,
412                            const gsl_vector *coef) {
413   string file_gamma, file_hyp, file_coef;
414   file_gamma = path_out + "/" + file_out;
415   file_gamma += ".gamma.txt";
416   file_hyp = path_out + "/" + file_out;
417   file_hyp += ".hyp.txt";
418   file_coef = path_out + "/" + file_out;
419   file_coef += ".coef.txt";
420 
421   ofstream outfile_gamma, outfile_hyp, outfile_coef;
422 
423   outfile_gamma.open(file_gamma.c_str(), ofstream::out);
424   outfile_hyp.open(file_hyp.c_str(), ofstream::out);
425   outfile_coef.open(file_coef.c_str(), ofstream::out);
426 
427   if (!outfile_gamma) {
428     cout << "error writing file: " << file_gamma << endl;
429     return;
430   }
431   if (!outfile_hyp) {
432     cout << "error writing file: " << file_hyp << endl;
433     return;
434   }
435   if (!outfile_coef) {
436     cout << "error writing file: " << file_coef << endl;
437     return;
438   }
439 
440   outfile_hyp << "h"
441               << "\t"
442               << "rho"
443               << "\t"
444               << "sa2"
445               << "\t"
446               << "sb2"
447               << "\t"
448               << "weight" << endl;
449   outfile_hyp << scientific;
450   for (size_t i = 0; i < Hyper->size1; i++) {
451     for (size_t j = 0; j < Hyper->size2; j++) {
452       outfile_hyp << setprecision(6) << gsl_matrix_get(Hyper, i, j) << "\t";
453     }
454     outfile_hyp << endl;
455   }
456 
457   outfile_gamma << "rs"
458                 << "\t"
459                 << "gamma" << endl;
460   for (size_t i = 0; i < vec_rs.size(); ++i) {
461     outfile_gamma << vec_rs[i] << "\t" << scientific << setprecision(6)
462                   << gsl_vector_get(pip, i) << endl;
463   }
464 
465   outfile_coef << "coef" << endl;
466   outfile_coef << scientific;
467   for (size_t i = 0; i < coef->size; i++) {
468     outfile_coef << setprecision(6) << gsl_vector_get(coef, i) << endl;
469   }
470 
471   outfile_coef.close();
472   outfile_coef.clear();
473   outfile_hyp.close();
474   outfile_hyp.clear();
475   outfile_gamma.close();
476   outfile_gamma.clear();
477   return;
478 }
479 
CalcMarginal(const gsl_vector * Uty,const gsl_vector * K_eval,const double sigma_b2,const double tau)480 double BSLMMDAP::CalcMarginal(const gsl_vector *Uty, const gsl_vector *K_eval,
481                               const double sigma_b2, const double tau) {
482   gsl_vector *weight_Hi = gsl_vector_alloc(Uty->size);
483 
484   double logm = 0.0;
485   double d, uy, Hi_yy = 0, logdet_H = 0.0;
486   for (size_t i = 0; i < ni_test; ++i) {
487     d = gsl_vector_get(K_eval, i) * sigma_b2;
488     d = 1.0 / (d + 1.0);
489     gsl_vector_set(weight_Hi, i, d);
490 
491     logdet_H -= log(d);
492     uy = gsl_vector_get(Uty, i);
493     Hi_yy += d * uy * uy;
494   }
495 
496   // Calculate likelihood.
497   logm = -0.5 * logdet_H - 0.5 * tau * Hi_yy + 0.5 * log(tau) * (double)ni_test;
498 
499   gsl_vector_free(weight_Hi);
500 
501   return logm;
502 }
503 
CalcMarginal(const gsl_matrix * UtXgamma,const gsl_vector * Uty,const gsl_vector * K_eval,const double sigma_a2,const double sigma_b2,const double tau)504 double BSLMMDAP::CalcMarginal(const gsl_matrix *UtXgamma, const gsl_vector *Uty,
505                               const gsl_vector *K_eval, const double sigma_a2,
506                               const double sigma_b2, const double tau) {
507   clock_t time_start;
508   double logm = 0.0;
509   double d, uy, P_yy = 0, logdet_O = 0.0, logdet_H = 0.0;
510 
511   gsl_matrix *UtXgamma_eval =
512       gsl_matrix_alloc(UtXgamma->size1, UtXgamma->size2);
513   gsl_matrix *Omega = gsl_matrix_alloc(UtXgamma->size2, UtXgamma->size2);
514   gsl_vector *XtHiy = gsl_vector_alloc(UtXgamma->size2);
515   gsl_vector *beta_hat = gsl_vector_alloc(UtXgamma->size2);
516   gsl_vector *weight_Hi = gsl_vector_alloc(UtXgamma->size1);
517 
518   gsl_matrix_memcpy(UtXgamma_eval, UtXgamma);
519 
520   logdet_H = 0.0;
521   P_yy = 0.0;
522   for (size_t i = 0; i < ni_test; ++i) {
523     gsl_vector_view UtXgamma_row = gsl_matrix_row(UtXgamma_eval, i);
524     d = gsl_vector_get(K_eval, i) * sigma_b2;
525     d = 1.0 / (d + 1.0);
526     gsl_vector_set(weight_Hi, i, d);
527 
528     logdet_H -= log(d);
529     uy = gsl_vector_get(Uty, i);
530     P_yy += d * uy * uy;
531     gsl_vector_scale(&UtXgamma_row.vector, d);
532   }
533 
534   // Calculate Omega.
535   gsl_matrix_set_identity(Omega);
536 
537   time_start = clock();
538   lapack_dgemm((char *)"T", (char *)"N", sigma_a2, UtXgamma_eval, UtXgamma, 1.0,
539                Omega);
540   time_Omega += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0);
541 
542   // Calculate beta_hat.
543   gsl_blas_dgemv(CblasTrans, 1.0, UtXgamma_eval, Uty, 0.0, XtHiy);
544 
545   logdet_O = CholeskySolve(Omega, XtHiy, beta_hat);
546 
547   gsl_vector_scale(beta_hat, sigma_a2);
548 
549   gsl_blas_ddot(XtHiy, beta_hat, &d);
550   P_yy -= d;
551 
552   gsl_matrix_free(UtXgamma_eval);
553   gsl_matrix_free(Omega);
554   gsl_vector_free(XtHiy);
555   gsl_vector_free(beta_hat);
556   gsl_vector_free(weight_Hi);
557 
558   logm = -0.5 * logdet_H - 0.5 * logdet_O - 0.5 * tau * P_yy +
559          0.5 * log(tau) * (double)ni_test;
560 
561   return logm;
562 }
563 
CalcPrior(class HYPBSLMM & cHyp)564 double BSLMMDAP::CalcPrior(class HYPBSLMM &cHyp) {
565   double logprior = 0;
566   logprior =
567       ((double)cHyp.n_gamma - 1.0) * cHyp.logp +
568       ((double)ns_test - (double)cHyp.n_gamma) * log(1.0 - exp(cHyp.logp));
569   return logprior;
570 }
571 
572 // Where A is the ni_test by n_cat matrix of annotations.
DAP_CalcBF(const gsl_matrix * U,const gsl_matrix * UtX,const gsl_vector * Uty,const gsl_vector * K_eval,const gsl_vector * y)573 void BSLMMDAP::DAP_CalcBF(const gsl_matrix *U, const gsl_matrix *UtX,
574                           const gsl_vector *Uty, const gsl_vector *K_eval,
575                           const gsl_vector *y) {
576   clock_t time_start;
577 
578   // Set up BF.
579   double tau, h, rho, sigma_a2, sigma_b2, d;
580   size_t ns_causal = 10;
581   size_t n_grid = h_ngrid * rho_ngrid;
582   vector<double> vec_sa2, vec_sb2, logm_null;
583 
584   gsl_matrix *BF = gsl_matrix_alloc(ns_test, n_grid);
585   gsl_matrix *Xgamma = gsl_matrix_alloc(ni_test, 1);
586   gsl_matrix *Hyper = gsl_matrix_alloc(n_grid, 5);
587 
588   // Compute tau by using yty.
589   gsl_blas_ddot(Uty, Uty, &tau);
590   tau = (double)ni_test / tau;
591 
592   // Set up grid values for sigma_a2 and sigma_b2 based on an
593   // approximately even grid for h and rho, and a fixed number
594   // of causals.
595   size_t ij = 0;
596   for (size_t i = 0; i < h_ngrid; i++) {
597     h = h_min + (h_max - h_min) * (double)i / ((double)h_ngrid - 1);
598     for (size_t j = 0; j < rho_ngrid; j++) {
599       rho = rho_min + (rho_max - rho_min) * (double)j / ((double)rho_ngrid - 1);
600 
601       sigma_a2 = h * rho / ((1 - h) * (double)ns_causal);
602       sigma_b2 = h * (1.0 - rho) / (trace_G * (1 - h));
603 
604       vec_sa2.push_back(sigma_a2);
605       vec_sb2.push_back(sigma_b2);
606       logm_null.push_back(CalcMarginal(Uty, K_eval, 0.0, tau));
607 
608       gsl_matrix_set(Hyper, ij, 0, h);
609       gsl_matrix_set(Hyper, ij, 1, rho);
610       gsl_matrix_set(Hyper, ij, 2, sigma_a2);
611       gsl_matrix_set(Hyper, ij, 3, sigma_b2);
612       gsl_matrix_set(Hyper, ij, 4, 1 / (double)n_grid);
613       ij++;
614     }
615   }
616 
617   // Compute BF factors.
618   time_start = clock();
619   cout << "Calculating BF..." << endl;
620   for (size_t t = 0; t < ns_test; t++) {
621     gsl_vector_view Xgamma_col = gsl_matrix_column(Xgamma, 0);
622     gsl_vector_const_view X_col = gsl_matrix_const_column(UtX, t);
623     gsl_vector_memcpy(&Xgamma_col.vector, &X_col.vector);
624 
625     for (size_t ij = 0; ij < n_grid; ij++) {
626       sigma_a2 = vec_sa2[ij];
627       sigma_b2 = vec_sb2[ij];
628 
629       d = CalcMarginal(Xgamma, Uty, K_eval, sigma_a2, sigma_b2, tau);
630       d -= logm_null[ij];
631       d = exp(d);
632 
633       gsl_matrix_set(BF, t, ij, d);
634     }
635   }
636   time_Proposal = (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0);
637 
638   // Save results.
639   WriteResult(Hyper, BF);
640 
641   // Free matrices and vectors.
642   gsl_matrix_free(BF);
643   gsl_matrix_free(Xgamma);
644   gsl_matrix_free(Hyper);
645   return;
646 }
647 
single_ct_regression(const gsl_matrix_int * Xd,const gsl_vector_int * dlevel,const gsl_vector * pip_vec,gsl_vector * coef,gsl_vector * prior_vec)648 void single_ct_regression(const gsl_matrix_int *Xd,
649                           const gsl_vector_int *dlevel,
650                           const gsl_vector *pip_vec, gsl_vector *coef,
651                           gsl_vector *prior_vec) {
652 
653   map<int, double> sum_pip;
654   map<int, double> sum;
655 
656   int levels = gsl_vector_int_get(dlevel, 0);
657 
658   for (int i = 0; i < levels; i++) {
659     sum_pip[i] = sum[i] = 0;
660   }
661 
662   for (size_t i = 0; i < Xd->size1; i++) {
663     int cat = gsl_matrix_int_get(Xd, i, 0);
664     sum_pip[cat] += gsl_vector_get(pip_vec, i);
665     sum[cat] += 1;
666   }
667 
668   for (size_t i = 0; i < Xd->size1; i++) {
669     int cat = gsl_matrix_int_get(Xd, i, 0);
670     gsl_vector_set(prior_vec, i, sum_pip[cat] / sum[cat]);
671   }
672 
673   for (int i = 0; i < levels; i++) {
674     double new_prior = sum_pip[i] / sum[i];
675     gsl_vector_set(coef, i, log(new_prior / (1 - new_prior)));
676   }
677 
678   return;
679 }
680 
681 // Where A is the ni_test by n_cat matrix of annotations.
DAP_EstimateHyper(const size_t kc,const size_t kd,const vector<string> & vec_rs,const vector<double> & vec_sa2,const vector<double> & vec_sb2,const vector<double> & wab,const vector<vector<vector<double>>> & BF,gsl_matrix * Ac,gsl_matrix_int * Ad,gsl_vector_int * dlevel)682 void BSLMMDAP::DAP_EstimateHyper(
683     const size_t kc, const size_t kd, const vector<string> &vec_rs,
684     const vector<double> &vec_sa2, const vector<double> &vec_sb2,
685     const vector<double> &wab, const vector<vector<vector<double>>> &BF,
686     gsl_matrix *Ac, gsl_matrix_int *Ad, gsl_vector_int *dlevel) {
687   // clock_t time_start;
688 
689   // Set up BF.
690   double h, rho, sigma_a2, sigma_b2, d, s, logm, logm_save = nan("");
691   size_t t1, t2;
692   size_t n_grid = wab.size(), ns_test = vec_rs.size();
693 
694   gsl_vector *prior_vec = gsl_vector_alloc(ns_test);
695   gsl_matrix *Hyper = gsl_matrix_alloc(n_grid, 5);
696   gsl_vector *pip = gsl_vector_alloc(ns_test);
697   gsl_vector *coef = gsl_vector_alloc(kc + kd + 1);
698 
699   // Perform the EM algorithm.
700   vector<double> vec_wab, vec_wab_new;
701 
702   // Initial values.
703   for (size_t t = 0; t < ns_test; t++) {
704     gsl_vector_set(prior_vec, t, (double)BF.size() / (double)ns_test);
705   }
706   for (size_t ij = 0; ij < n_grid; ij++) {
707     vec_wab.push_back(wab[ij]);
708     vec_wab_new.push_back(wab[ij]);
709   }
710 
711   // EM iteration.
712   size_t it = 0;
713   double dif = 1;
714   while (it < 100 && dif > 1e-3) {
715 
716     // Update E_gamma.
717     t1 = 0, t2 = 0;
718     for (size_t b = 0; b < BF.size(); b++) {
719       s = 1;
720       for (size_t m = 0; m < BF[b].size(); m++) {
721         d = 0;
722         for (size_t ij = 0; ij < n_grid; ij++) {
723           d += vec_wab_new[ij] * BF[b][m][ij];
724         }
725         d *=
726             gsl_vector_get(prior_vec, t1) / (1 - gsl_vector_get(prior_vec, t1));
727 
728         gsl_vector_set(pip, t1, d);
729         s += d;
730         t1++;
731       }
732 
733       for (size_t m = 0; m < BF[b].size(); m++) {
734         d = gsl_vector_get(pip, t2) / s;
735         gsl_vector_set(pip, t2, d);
736         t2++;
737       }
738     }
739 
740     // Update E_wab.
741     s = 0;
742     for (size_t ij = 0; ij < n_grid; ij++) {
743       vec_wab_new[ij] = 0;
744 
745       t1 = 0;
746       for (size_t b = 0; b < BF.size(); b++) {
747         d = 1;
748         for (size_t m = 0; m < BF[b].size(); m++) {
749           d += gsl_vector_get(prior_vec, t1) /
750                (1 - gsl_vector_get(prior_vec, t1)) * vec_wab[ij] * BF[b][m][ij];
751           t1++;
752         }
753         vec_wab_new[ij] += log(d);
754       }
755 
756       s = max(s, vec_wab_new[ij]);
757     }
758 
759     d = 0;
760     for (size_t ij = 0; ij < n_grid; ij++) {
761       vec_wab_new[ij] = exp(vec_wab_new[ij] - s);
762       d += vec_wab_new[ij];
763     }
764 
765     for (size_t ij = 0; ij < n_grid; ij++) {
766       vec_wab_new[ij] /= d;
767     }
768 
769     // Update coef, and pi.
770     if (kc == 0 && kd == 0) {
771 
772       // No annotation.
773       s = 0;
774       for (size_t t = 0; t < pip->size; t++) {
775         s += gsl_vector_get(pip, t);
776       }
777       s = s / (double)pip->size;
778       for (size_t t = 0; t < pip->size; t++) {
779         gsl_vector_set(prior_vec, t, s);
780       }
781 
782       gsl_vector_set(coef, 0, log(s / (1 - s)));
783     } else if (kc == 0 && kd != 0) {
784 
785       // Only discrete annotations.
786       if (kd == 1) {
787         single_ct_regression(Ad, dlevel, pip, coef, prior_vec);
788       } else {
789         logistic_cat_fit(coef, Ad, dlevel, pip, 0, 0);
790         logistic_cat_pred(coef, Ad, dlevel, prior_vec);
791       }
792     } else if (kc != 0 && kd == 0) {
793 
794       // Only continuous annotations.
795       logistic_cont_fit(coef, Ac, pip, 0, 0);
796       logistic_cont_pred(coef, Ac, prior_vec);
797     } else if (kc != 0 && kd != 0) {
798 
799       // Both continuous and categorical annotations.
800       logistic_mixed_fit(coef, Ad, dlevel, Ac, pip, 0, 0);
801       logistic_mixed_pred(coef, Ad, dlevel, Ac, prior_vec);
802     }
803 
804     // Compute marginal likelihood.
805     logm = 0;
806 
807     t1 = 0;
808     for (size_t b = 0; b < BF.size(); b++) {
809       d = 1;
810       s = 0;
811       for (size_t m = 0; m < BF[b].size(); m++) {
812         s += log(1 - gsl_vector_get(prior_vec, t1));
813         for (size_t ij = 0; ij < n_grid; ij++) {
814           d += gsl_vector_get(prior_vec, t1) /
815                (1 - gsl_vector_get(prior_vec, t1)) * vec_wab[ij] * BF[b][m][ij];
816         }
817       }
818       logm += log(d) + s;
819       t1++;
820     }
821 
822     if (it > 0) {
823       dif = logm - logm_save;
824     }
825     logm_save = logm;
826     it++;
827 
828     cout << "iteration = " << it << "; marginal likelihood = " << logm << endl;
829   }
830 
831   // Update h and rho that correspond to w_ab.
832   for (size_t ij = 0; ij < n_grid; ij++) {
833     sigma_a2 = vec_sa2[ij];
834     sigma_b2 = vec_sb2[ij];
835 
836     d = exp(gsl_vector_get(coef, coef->size - 1)) /
837         (1 + exp(gsl_vector_get(coef, coef->size - 1)));
838     h = (d * (double)ns_test * sigma_a2 + 1 * sigma_b2) /
839         (1 + d * (double)ns_test * sigma_a2 + 1 * sigma_b2);
840     rho = d * (double)ns_test * sigma_a2 /
841           (d * (double)ns_test * sigma_a2 + 1 * sigma_b2);
842 
843     gsl_matrix_set(Hyper, ij, 0, h);
844     gsl_matrix_set(Hyper, ij, 1, rho);
845     gsl_matrix_set(Hyper, ij, 2, sigma_a2);
846     gsl_matrix_set(Hyper, ij, 3, sigma_b2);
847     gsl_matrix_set(Hyper, ij, 4, vec_wab_new[ij]);
848   }
849 
850   // Obtain beta and alpha parameters.
851 
852   // Save results.
853   WriteResult(vec_rs, Hyper, pip, coef);
854 
855   // Free matrices and vectors.
856   gsl_vector_free(prior_vec);
857   gsl_matrix_free(Hyper);
858   gsl_vector_free(pip);
859   gsl_vector_free(coef);
860   return;
861 }
862