1 /*
2 Genome-wide Efficient Mixed Model Association (GEMMA)
3 Copyright (C) 2011-2017, Xiang Zhou
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
17 */
18
19 #include <fstream>
20 #include <iostream>
21 #include <sstream>
22
23 #include <algorithm>
24 #include <cmath>
25 #include <cstring>
26 #include <ctime>
27 #include <iomanip>
28 #include <iostream>
29 #include <stdio.h>
30 #include <stdlib.h>
31
32 #include "gsl/gsl_blas.h"
33 #include "gsl/gsl_cdf.h"
34 #include "gsl/gsl_eigen.h"
35 #include "gsl/gsl_linalg.h"
36 #include "gsl/gsl_matrix.h"
37 #include "gsl/gsl_randist.h"
38 #include "gsl/gsl_roots.h"
39 #include "gsl/gsl_vector.h"
40
41 #include "bslmmdap.h"
42 #include "gemma_io.h"
43 #include "lapack.h"
44 #include "lm.h"
45 #include "lmm.h"
46 #include "logistic.h"
47 #include "mathfunc.h"
48 #include "param.h"
49
50 using namespace std;
51
CopyFromParam(PARAM & cPar)52 void BSLMMDAP::CopyFromParam(PARAM &cPar) {
53 file_out = cPar.file_out;
54 path_out = cPar.path_out;
55
56 time_UtZ = 0.0;
57 time_Omega = 0.0;
58
59 h_min = cPar.h_min;
60 h_max = cPar.h_max;
61 h_ngrid = cPar.h_ngrid;
62 rho_min = cPar.rho_min;
63 rho_max = cPar.rho_max;
64 rho_ngrid = cPar.rho_ngrid;
65
66 if (h_min <= 0) {
67 h_min = 0.01;
68 }
69 if (h_max >= 1) {
70 h_max = 0.99;
71 }
72 if (rho_min <= 0) {
73 rho_min = 0.01;
74 }
75 if (rho_max >= 1) {
76 rho_max = 0.99;
77 }
78
79 trace_G = cPar.trace_G;
80
81 ni_total = cPar.ni_total;
82 ns_total = cPar.ns_total;
83 ni_test = cPar.ni_test;
84 ns_test = cPar.ns_test;
85
86 indicator_idv = cPar.indicator_idv;
87 indicator_snp = cPar.indicator_snp;
88 snpInfo = cPar.snpInfo;
89
90 return;
91 }
92
CopyToParam(PARAM & cPar)93 void BSLMMDAP::CopyToParam(PARAM &cPar) {
94 cPar.time_UtZ = time_UtZ;
95 cPar.time_Omega = time_Omega;
96
97 return;
98 }
99
100 // Read hyp file.
ReadFile_hyb(const string & file_hyp,vector<double> & vec_sa2,vector<double> & vec_sb2,vector<double> & vec_wab)101 void ReadFile_hyb(const string &file_hyp, vector<double> &vec_sa2,
102 vector<double> &vec_sb2, vector<double> &vec_wab) {
103 vec_sa2.clear();
104 vec_sb2.clear();
105 vec_wab.clear();
106
107 igzstream infile(file_hyp.c_str(), igzstream::in);
108 if (!infile) {
109 cout << "error! fail to open hyp file: " << file_hyp << endl;
110 return;
111 }
112
113 string line;
114 char *ch_ptr;
115
116 getline(infile, line);
117
118 while (!safeGetline(infile, line).eof()) {
119 ch_ptr = strtok_safe((char *)line.c_str(), " , \t");
120 ch_ptr = strtok_safe(NULL, " , \t");
121
122 ch_ptr = strtok_safe(NULL, " , \t");
123 vec_sa2.push_back(atof(ch_ptr));
124
125 ch_ptr = strtok_safe(NULL, " , \t");
126 vec_sb2.push_back(atof(ch_ptr));
127
128 ch_ptr = strtok_safe(NULL, " , \t");
129 vec_wab.push_back(atof(ch_ptr));
130 }
131
132 infile.close();
133 infile.clear();
134
135 return;
136 }
137
138 // Read bf file.
ReadFile_bf(const string & file_bf,vector<string> & vec_rs,vector<vector<vector<double>>> & BF)139 void ReadFile_bf(const string &file_bf, vector<string> &vec_rs,
140 vector<vector<vector<double>>> &BF) {
141 BF.clear();
142 vec_rs.clear();
143
144 igzstream infile(file_bf.c_str(), igzstream::in);
145 if (!infile) {
146 cout << "error! fail to open bf file: " << file_bf << endl;
147 return;
148 }
149
150 string line, rs, block;
151 vector<double> vec_bf;
152 vector<vector<double>> mat_bf;
153 char *ch_ptr;
154
155 size_t bf_size = 0, flag_block;
156
157 getline(infile, line);
158
159 size_t t = 0;
160 while (!safeGetline(infile, line).eof()) {
161 flag_block = 0;
162
163 ch_ptr = strtok_safe((char *)line.c_str(), " , \t");
164 rs = ch_ptr;
165 vec_rs.push_back(rs);
166
167 ch_ptr = strtok_safe(NULL, " , \t");
168 if (t == 0) {
169 block = ch_ptr;
170 } else {
171 if (strcmp(ch_ptr, block.c_str()) != 0) {
172 flag_block = 1;
173 block = ch_ptr;
174 }
175 }
176
177 ch_ptr = strtok(NULL, " , \t");
178 while (ch_ptr != NULL) {
179 vec_bf.push_back(atof(ch_ptr));
180 ch_ptr = strtok(NULL, " , \t");
181 }
182
183 if (t == 0) {
184 bf_size = vec_bf.size();
185 } else {
186 if (bf_size != vec_bf.size()) {
187 cout << "error! unequal row size in bf file." << endl;
188 }
189 }
190
191 if (flag_block == 0) {
192 mat_bf.push_back(vec_bf);
193 } else {
194 BF.push_back(mat_bf);
195 mat_bf.clear();
196 }
197 vec_bf.clear();
198
199 t++;
200 }
201
202 infile.close();
203 infile.clear();
204
205 return;
206 }
207
208 // Read category files.
209 // Read both continuous and discrete category file, record mapRS2catc.
ReadFile_cat(const string & file_cat,const vector<string> & vec_rs,gsl_matrix * Ac,gsl_matrix_int * Ad,gsl_vector_int * dlevel,size_t & kc,size_t & kd)210 void ReadFile_cat(const string &file_cat, const vector<string> &vec_rs,
211 gsl_matrix *Ac, gsl_matrix_int *Ad, gsl_vector_int *dlevel,
212 size_t &kc, size_t &kd) {
213 igzstream infile(file_cat.c_str(), igzstream::in);
214 if (!infile) {
215 cout << "error! fail to open category file: " << file_cat << endl;
216 return;
217 }
218
219 string line;
220 char *ch_ptr;
221
222 string rs, chr, a1, a0, pos, cm;
223
224 // Read header.
225 HEADER header;
226 safeGetline(infile, line).eof();
227 ReadHeader_io(line, header);
228
229 // Use the header to determine the number of categories.
230 kc = header.catc_col.size();
231 kd = header.catd_col.size();
232
233 // set up storage and mapper
234 map<string, vector<double>> mapRS2catc;
235 map<string, vector<int>> mapRS2catd;
236 vector<double> catc;
237 vector<int> catd;
238
239 // Read the following lines to record mapRS2cat.
240 while (!safeGetline(infile, line).eof()) {
241 ch_ptr = strtok_safe((char *)line.c_str(), " , \t");
242
243 if (header.rs_col == 0) {
244 rs = chr + ":" + pos;
245 }
246
247 catc.clear();
248 catd.clear();
249
250 for (size_t i = 0; i < header.coln; i++) {
251 enforce(ch_ptr);
252 if (header.rs_col != 0 && header.rs_col == i + 1) {
253 rs = ch_ptr;
254 } else if (header.chr_col != 0 && header.chr_col == i + 1) {
255 chr = ch_ptr;
256 } else if (header.pos_col != 0 && header.pos_col == i + 1) {
257 pos = ch_ptr;
258 } else if (header.cm_col != 0 && header.cm_col == i + 1) {
259 cm = ch_ptr;
260 } else if (header.a1_col != 0 && header.a1_col == i + 1) {
261 a1 = ch_ptr;
262 } else if (header.a0_col != 0 && header.a0_col == i + 1) {
263 a0 = ch_ptr;
264 } else if (header.catc_col.size() != 0 &&
265 header.catc_col.count(i + 1) != 0) {
266 catc.push_back(atof(ch_ptr));
267 } else if (header.catd_col.size() != 0 &&
268 header.catd_col.count(i + 1) != 0) {
269 catd.push_back(atoi(ch_ptr));
270 } else {
271 }
272
273 ch_ptr = strtok(NULL, " , \t");
274 }
275
276 if (mapRS2catc.count(rs) == 0 && kc > 0) {
277 mapRS2catc[rs] = catc;
278 }
279 if (mapRS2catd.count(rs) == 0 && kd > 0) {
280 mapRS2catd[rs] = catd;
281 }
282 }
283
284 // Load into Ad and Ac.
285 if (kc > 0) {
286 Ac = gsl_matrix_alloc(vec_rs.size(), kc);
287 for (size_t i = 0; i < vec_rs.size(); i++) {
288 if (mapRS2catc.count(vec_rs[i]) != 0) {
289 for (size_t j = 0; j < kc; j++) {
290 gsl_matrix_set(Ac, i, j, mapRS2catc[vec_rs[i]][j]);
291 }
292 } else {
293 for (size_t j = 0; j < kc; j++) {
294 gsl_matrix_set(Ac, i, j, 0);
295 }
296 }
297 }
298 }
299
300 if (kd > 0) {
301 Ad = gsl_matrix_int_alloc(vec_rs.size(), kd);
302
303 for (size_t i = 0; i < vec_rs.size(); i++) {
304 if (mapRS2catd.count(vec_rs[i]) != 0) {
305 for (size_t j = 0; j < kd; j++) {
306 gsl_matrix_int_set(Ad, i, j, mapRS2catd[vec_rs[i]][j]);
307 }
308 } else {
309 for (size_t j = 0; j < kd; j++) {
310 gsl_matrix_int_set(Ad, i, j, 0);
311 }
312 }
313 }
314
315 dlevel = gsl_vector_int_alloc(kd);
316 map<int, int> rcd;
317 int val;
318 for (size_t j = 0; j < kd; j++) {
319 rcd.clear();
320 for (size_t i = 0; i < Ad->size1; i++) {
321 val = gsl_matrix_int_get(Ad, i, j);
322 rcd[val] = 1;
323 }
324 gsl_vector_int_set(dlevel, j, rcd.size());
325 }
326 }
327
328 infile.clear();
329 infile.close();
330
331 return;
332 }
333
WriteResult(const gsl_matrix * Hyper,const gsl_matrix * BF)334 void BSLMMDAP::WriteResult(const gsl_matrix *Hyper, const gsl_matrix *BF) {
335 string file_bf, file_hyp;
336 file_bf = path_out + "/" + file_out;
337 file_bf += ".bf.txt";
338 file_hyp = path_out + "/" + file_out;
339 file_hyp += ".hyp.txt";
340
341 ofstream outfile_bf, outfile_hyp;
342
343 outfile_bf.open(file_bf.c_str(), ofstream::out);
344 outfile_hyp.open(file_hyp.c_str(), ofstream::out);
345
346 if (!outfile_bf) {
347 cout << "error writing file: " << file_bf << endl;
348 return;
349 }
350 if (!outfile_hyp) {
351 cout << "error writing file: " << file_hyp << endl;
352 return;
353 }
354
355 outfile_hyp << "h"
356 << "\t"
357 << "rho"
358 << "\t"
359 << "sa2"
360 << "\t"
361 << "sb2"
362 << "\t"
363 << "weight" << endl;
364 outfile_hyp << scientific;
365 for (size_t i = 0; i < Hyper->size1; i++) {
366 for (size_t j = 0; j < Hyper->size2; j++) {
367 outfile_hyp << setprecision(6) << gsl_matrix_get(Hyper, i, j) << "\t";
368 }
369 outfile_hyp << endl;
370 }
371
372 outfile_bf << "chr"
373 << "\t"
374 << "rs"
375 << "\t"
376 << "ps"
377 << "\t"
378 << "n_miss";
379 for (size_t i = 0; i < BF->size2; i++) {
380 outfile_bf << "\t"
381 << "BF" << i + 1;
382 }
383 outfile_bf << endl;
384
385 size_t t = 0;
386 for (size_t i = 0; i < ns_total; ++i) {
387 if (indicator_snp[i] == 0) {
388 continue;
389 }
390
391 outfile_bf << snpInfo[i].chr << "\t" << snpInfo[i].rs_number << "\t"
392 << snpInfo[i].base_position << "\t" << snpInfo[i].n_miss;
393
394 outfile_bf << scientific;
395 for (size_t j = 0; j < BF->size2; j++) {
396 outfile_bf << "\t" << setprecision(6) << gsl_matrix_get(BF, t, j);
397 }
398 outfile_bf << endl;
399
400 t++;
401 }
402
403 outfile_hyp.close();
404 outfile_hyp.clear();
405 outfile_bf.close();
406 outfile_bf.clear();
407 return;
408 }
409
WriteResult(const vector<string> & vec_rs,const gsl_matrix * Hyper,const gsl_vector * pip,const gsl_vector * coef)410 void BSLMMDAP::WriteResult(const vector<string> &vec_rs,
411 const gsl_matrix *Hyper, const gsl_vector *pip,
412 const gsl_vector *coef) {
413 string file_gamma, file_hyp, file_coef;
414 file_gamma = path_out + "/" + file_out;
415 file_gamma += ".gamma.txt";
416 file_hyp = path_out + "/" + file_out;
417 file_hyp += ".hyp.txt";
418 file_coef = path_out + "/" + file_out;
419 file_coef += ".coef.txt";
420
421 ofstream outfile_gamma, outfile_hyp, outfile_coef;
422
423 outfile_gamma.open(file_gamma.c_str(), ofstream::out);
424 outfile_hyp.open(file_hyp.c_str(), ofstream::out);
425 outfile_coef.open(file_coef.c_str(), ofstream::out);
426
427 if (!outfile_gamma) {
428 cout << "error writing file: " << file_gamma << endl;
429 return;
430 }
431 if (!outfile_hyp) {
432 cout << "error writing file: " << file_hyp << endl;
433 return;
434 }
435 if (!outfile_coef) {
436 cout << "error writing file: " << file_coef << endl;
437 return;
438 }
439
440 outfile_hyp << "h"
441 << "\t"
442 << "rho"
443 << "\t"
444 << "sa2"
445 << "\t"
446 << "sb2"
447 << "\t"
448 << "weight" << endl;
449 outfile_hyp << scientific;
450 for (size_t i = 0; i < Hyper->size1; i++) {
451 for (size_t j = 0; j < Hyper->size2; j++) {
452 outfile_hyp << setprecision(6) << gsl_matrix_get(Hyper, i, j) << "\t";
453 }
454 outfile_hyp << endl;
455 }
456
457 outfile_gamma << "rs"
458 << "\t"
459 << "gamma" << endl;
460 for (size_t i = 0; i < vec_rs.size(); ++i) {
461 outfile_gamma << vec_rs[i] << "\t" << scientific << setprecision(6)
462 << gsl_vector_get(pip, i) << endl;
463 }
464
465 outfile_coef << "coef" << endl;
466 outfile_coef << scientific;
467 for (size_t i = 0; i < coef->size; i++) {
468 outfile_coef << setprecision(6) << gsl_vector_get(coef, i) << endl;
469 }
470
471 outfile_coef.close();
472 outfile_coef.clear();
473 outfile_hyp.close();
474 outfile_hyp.clear();
475 outfile_gamma.close();
476 outfile_gamma.clear();
477 return;
478 }
479
CalcMarginal(const gsl_vector * Uty,const gsl_vector * K_eval,const double sigma_b2,const double tau)480 double BSLMMDAP::CalcMarginal(const gsl_vector *Uty, const gsl_vector *K_eval,
481 const double sigma_b2, const double tau) {
482 gsl_vector *weight_Hi = gsl_vector_alloc(Uty->size);
483
484 double logm = 0.0;
485 double d, uy, Hi_yy = 0, logdet_H = 0.0;
486 for (size_t i = 0; i < ni_test; ++i) {
487 d = gsl_vector_get(K_eval, i) * sigma_b2;
488 d = 1.0 / (d + 1.0);
489 gsl_vector_set(weight_Hi, i, d);
490
491 logdet_H -= log(d);
492 uy = gsl_vector_get(Uty, i);
493 Hi_yy += d * uy * uy;
494 }
495
496 // Calculate likelihood.
497 logm = -0.5 * logdet_H - 0.5 * tau * Hi_yy + 0.5 * log(tau) * (double)ni_test;
498
499 gsl_vector_free(weight_Hi);
500
501 return logm;
502 }
503
CalcMarginal(const gsl_matrix * UtXgamma,const gsl_vector * Uty,const gsl_vector * K_eval,const double sigma_a2,const double sigma_b2,const double tau)504 double BSLMMDAP::CalcMarginal(const gsl_matrix *UtXgamma, const gsl_vector *Uty,
505 const gsl_vector *K_eval, const double sigma_a2,
506 const double sigma_b2, const double tau) {
507 clock_t time_start;
508 double logm = 0.0;
509 double d, uy, P_yy = 0, logdet_O = 0.0, logdet_H = 0.0;
510
511 gsl_matrix *UtXgamma_eval =
512 gsl_matrix_alloc(UtXgamma->size1, UtXgamma->size2);
513 gsl_matrix *Omega = gsl_matrix_alloc(UtXgamma->size2, UtXgamma->size2);
514 gsl_vector *XtHiy = gsl_vector_alloc(UtXgamma->size2);
515 gsl_vector *beta_hat = gsl_vector_alloc(UtXgamma->size2);
516 gsl_vector *weight_Hi = gsl_vector_alloc(UtXgamma->size1);
517
518 gsl_matrix_memcpy(UtXgamma_eval, UtXgamma);
519
520 logdet_H = 0.0;
521 P_yy = 0.0;
522 for (size_t i = 0; i < ni_test; ++i) {
523 gsl_vector_view UtXgamma_row = gsl_matrix_row(UtXgamma_eval, i);
524 d = gsl_vector_get(K_eval, i) * sigma_b2;
525 d = 1.0 / (d + 1.0);
526 gsl_vector_set(weight_Hi, i, d);
527
528 logdet_H -= log(d);
529 uy = gsl_vector_get(Uty, i);
530 P_yy += d * uy * uy;
531 gsl_vector_scale(&UtXgamma_row.vector, d);
532 }
533
534 // Calculate Omega.
535 gsl_matrix_set_identity(Omega);
536
537 time_start = clock();
538 lapack_dgemm((char *)"T", (char *)"N", sigma_a2, UtXgamma_eval, UtXgamma, 1.0,
539 Omega);
540 time_Omega += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0);
541
542 // Calculate beta_hat.
543 gsl_blas_dgemv(CblasTrans, 1.0, UtXgamma_eval, Uty, 0.0, XtHiy);
544
545 logdet_O = CholeskySolve(Omega, XtHiy, beta_hat);
546
547 gsl_vector_scale(beta_hat, sigma_a2);
548
549 gsl_blas_ddot(XtHiy, beta_hat, &d);
550 P_yy -= d;
551
552 gsl_matrix_free(UtXgamma_eval);
553 gsl_matrix_free(Omega);
554 gsl_vector_free(XtHiy);
555 gsl_vector_free(beta_hat);
556 gsl_vector_free(weight_Hi);
557
558 logm = -0.5 * logdet_H - 0.5 * logdet_O - 0.5 * tau * P_yy +
559 0.5 * log(tau) * (double)ni_test;
560
561 return logm;
562 }
563
CalcPrior(class HYPBSLMM & cHyp)564 double BSLMMDAP::CalcPrior(class HYPBSLMM &cHyp) {
565 double logprior = 0;
566 logprior =
567 ((double)cHyp.n_gamma - 1.0) * cHyp.logp +
568 ((double)ns_test - (double)cHyp.n_gamma) * log(1.0 - exp(cHyp.logp));
569 return logprior;
570 }
571
572 // Where A is the ni_test by n_cat matrix of annotations.
DAP_CalcBF(const gsl_matrix * U,const gsl_matrix * UtX,const gsl_vector * Uty,const gsl_vector * K_eval,const gsl_vector * y)573 void BSLMMDAP::DAP_CalcBF(const gsl_matrix *U, const gsl_matrix *UtX,
574 const gsl_vector *Uty, const gsl_vector *K_eval,
575 const gsl_vector *y) {
576 clock_t time_start;
577
578 // Set up BF.
579 double tau, h, rho, sigma_a2, sigma_b2, d;
580 size_t ns_causal = 10;
581 size_t n_grid = h_ngrid * rho_ngrid;
582 vector<double> vec_sa2, vec_sb2, logm_null;
583
584 gsl_matrix *BF = gsl_matrix_alloc(ns_test, n_grid);
585 gsl_matrix *Xgamma = gsl_matrix_alloc(ni_test, 1);
586 gsl_matrix *Hyper = gsl_matrix_alloc(n_grid, 5);
587
588 // Compute tau by using yty.
589 gsl_blas_ddot(Uty, Uty, &tau);
590 tau = (double)ni_test / tau;
591
592 // Set up grid values for sigma_a2 and sigma_b2 based on an
593 // approximately even grid for h and rho, and a fixed number
594 // of causals.
595 size_t ij = 0;
596 for (size_t i = 0; i < h_ngrid; i++) {
597 h = h_min + (h_max - h_min) * (double)i / ((double)h_ngrid - 1);
598 for (size_t j = 0; j < rho_ngrid; j++) {
599 rho = rho_min + (rho_max - rho_min) * (double)j / ((double)rho_ngrid - 1);
600
601 sigma_a2 = h * rho / ((1 - h) * (double)ns_causal);
602 sigma_b2 = h * (1.0 - rho) / (trace_G * (1 - h));
603
604 vec_sa2.push_back(sigma_a2);
605 vec_sb2.push_back(sigma_b2);
606 logm_null.push_back(CalcMarginal(Uty, K_eval, 0.0, tau));
607
608 gsl_matrix_set(Hyper, ij, 0, h);
609 gsl_matrix_set(Hyper, ij, 1, rho);
610 gsl_matrix_set(Hyper, ij, 2, sigma_a2);
611 gsl_matrix_set(Hyper, ij, 3, sigma_b2);
612 gsl_matrix_set(Hyper, ij, 4, 1 / (double)n_grid);
613 ij++;
614 }
615 }
616
617 // Compute BF factors.
618 time_start = clock();
619 cout << "Calculating BF..." << endl;
620 for (size_t t = 0; t < ns_test; t++) {
621 gsl_vector_view Xgamma_col = gsl_matrix_column(Xgamma, 0);
622 gsl_vector_const_view X_col = gsl_matrix_const_column(UtX, t);
623 gsl_vector_memcpy(&Xgamma_col.vector, &X_col.vector);
624
625 for (size_t ij = 0; ij < n_grid; ij++) {
626 sigma_a2 = vec_sa2[ij];
627 sigma_b2 = vec_sb2[ij];
628
629 d = CalcMarginal(Xgamma, Uty, K_eval, sigma_a2, sigma_b2, tau);
630 d -= logm_null[ij];
631 d = exp(d);
632
633 gsl_matrix_set(BF, t, ij, d);
634 }
635 }
636 time_Proposal = (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0);
637
638 // Save results.
639 WriteResult(Hyper, BF);
640
641 // Free matrices and vectors.
642 gsl_matrix_free(BF);
643 gsl_matrix_free(Xgamma);
644 gsl_matrix_free(Hyper);
645 return;
646 }
647
single_ct_regression(const gsl_matrix_int * Xd,const gsl_vector_int * dlevel,const gsl_vector * pip_vec,gsl_vector * coef,gsl_vector * prior_vec)648 void single_ct_regression(const gsl_matrix_int *Xd,
649 const gsl_vector_int *dlevel,
650 const gsl_vector *pip_vec, gsl_vector *coef,
651 gsl_vector *prior_vec) {
652
653 map<int, double> sum_pip;
654 map<int, double> sum;
655
656 int levels = gsl_vector_int_get(dlevel, 0);
657
658 for (int i = 0; i < levels; i++) {
659 sum_pip[i] = sum[i] = 0;
660 }
661
662 for (size_t i = 0; i < Xd->size1; i++) {
663 int cat = gsl_matrix_int_get(Xd, i, 0);
664 sum_pip[cat] += gsl_vector_get(pip_vec, i);
665 sum[cat] += 1;
666 }
667
668 for (size_t i = 0; i < Xd->size1; i++) {
669 int cat = gsl_matrix_int_get(Xd, i, 0);
670 gsl_vector_set(prior_vec, i, sum_pip[cat] / sum[cat]);
671 }
672
673 for (int i = 0; i < levels; i++) {
674 double new_prior = sum_pip[i] / sum[i];
675 gsl_vector_set(coef, i, log(new_prior / (1 - new_prior)));
676 }
677
678 return;
679 }
680
681 // Where A is the ni_test by n_cat matrix of annotations.
DAP_EstimateHyper(const size_t kc,const size_t kd,const vector<string> & vec_rs,const vector<double> & vec_sa2,const vector<double> & vec_sb2,const vector<double> & wab,const vector<vector<vector<double>>> & BF,gsl_matrix * Ac,gsl_matrix_int * Ad,gsl_vector_int * dlevel)682 void BSLMMDAP::DAP_EstimateHyper(
683 const size_t kc, const size_t kd, const vector<string> &vec_rs,
684 const vector<double> &vec_sa2, const vector<double> &vec_sb2,
685 const vector<double> &wab, const vector<vector<vector<double>>> &BF,
686 gsl_matrix *Ac, gsl_matrix_int *Ad, gsl_vector_int *dlevel) {
687 // clock_t time_start;
688
689 // Set up BF.
690 double h, rho, sigma_a2, sigma_b2, d, s, logm, logm_save = nan("");
691 size_t t1, t2;
692 size_t n_grid = wab.size(), ns_test = vec_rs.size();
693
694 gsl_vector *prior_vec = gsl_vector_alloc(ns_test);
695 gsl_matrix *Hyper = gsl_matrix_alloc(n_grid, 5);
696 gsl_vector *pip = gsl_vector_alloc(ns_test);
697 gsl_vector *coef = gsl_vector_alloc(kc + kd + 1);
698
699 // Perform the EM algorithm.
700 vector<double> vec_wab, vec_wab_new;
701
702 // Initial values.
703 for (size_t t = 0; t < ns_test; t++) {
704 gsl_vector_set(prior_vec, t, (double)BF.size() / (double)ns_test);
705 }
706 for (size_t ij = 0; ij < n_grid; ij++) {
707 vec_wab.push_back(wab[ij]);
708 vec_wab_new.push_back(wab[ij]);
709 }
710
711 // EM iteration.
712 size_t it = 0;
713 double dif = 1;
714 while (it < 100 && dif > 1e-3) {
715
716 // Update E_gamma.
717 t1 = 0, t2 = 0;
718 for (size_t b = 0; b < BF.size(); b++) {
719 s = 1;
720 for (size_t m = 0; m < BF[b].size(); m++) {
721 d = 0;
722 for (size_t ij = 0; ij < n_grid; ij++) {
723 d += vec_wab_new[ij] * BF[b][m][ij];
724 }
725 d *=
726 gsl_vector_get(prior_vec, t1) / (1 - gsl_vector_get(prior_vec, t1));
727
728 gsl_vector_set(pip, t1, d);
729 s += d;
730 t1++;
731 }
732
733 for (size_t m = 0; m < BF[b].size(); m++) {
734 d = gsl_vector_get(pip, t2) / s;
735 gsl_vector_set(pip, t2, d);
736 t2++;
737 }
738 }
739
740 // Update E_wab.
741 s = 0;
742 for (size_t ij = 0; ij < n_grid; ij++) {
743 vec_wab_new[ij] = 0;
744
745 t1 = 0;
746 for (size_t b = 0; b < BF.size(); b++) {
747 d = 1;
748 for (size_t m = 0; m < BF[b].size(); m++) {
749 d += gsl_vector_get(prior_vec, t1) /
750 (1 - gsl_vector_get(prior_vec, t1)) * vec_wab[ij] * BF[b][m][ij];
751 t1++;
752 }
753 vec_wab_new[ij] += log(d);
754 }
755
756 s = max(s, vec_wab_new[ij]);
757 }
758
759 d = 0;
760 for (size_t ij = 0; ij < n_grid; ij++) {
761 vec_wab_new[ij] = exp(vec_wab_new[ij] - s);
762 d += vec_wab_new[ij];
763 }
764
765 for (size_t ij = 0; ij < n_grid; ij++) {
766 vec_wab_new[ij] /= d;
767 }
768
769 // Update coef, and pi.
770 if (kc == 0 && kd == 0) {
771
772 // No annotation.
773 s = 0;
774 for (size_t t = 0; t < pip->size; t++) {
775 s += gsl_vector_get(pip, t);
776 }
777 s = s / (double)pip->size;
778 for (size_t t = 0; t < pip->size; t++) {
779 gsl_vector_set(prior_vec, t, s);
780 }
781
782 gsl_vector_set(coef, 0, log(s / (1 - s)));
783 } else if (kc == 0 && kd != 0) {
784
785 // Only discrete annotations.
786 if (kd == 1) {
787 single_ct_regression(Ad, dlevel, pip, coef, prior_vec);
788 } else {
789 logistic_cat_fit(coef, Ad, dlevel, pip, 0, 0);
790 logistic_cat_pred(coef, Ad, dlevel, prior_vec);
791 }
792 } else if (kc != 0 && kd == 0) {
793
794 // Only continuous annotations.
795 logistic_cont_fit(coef, Ac, pip, 0, 0);
796 logistic_cont_pred(coef, Ac, prior_vec);
797 } else if (kc != 0 && kd != 0) {
798
799 // Both continuous and categorical annotations.
800 logistic_mixed_fit(coef, Ad, dlevel, Ac, pip, 0, 0);
801 logistic_mixed_pred(coef, Ad, dlevel, Ac, prior_vec);
802 }
803
804 // Compute marginal likelihood.
805 logm = 0;
806
807 t1 = 0;
808 for (size_t b = 0; b < BF.size(); b++) {
809 d = 1;
810 s = 0;
811 for (size_t m = 0; m < BF[b].size(); m++) {
812 s += log(1 - gsl_vector_get(prior_vec, t1));
813 for (size_t ij = 0; ij < n_grid; ij++) {
814 d += gsl_vector_get(prior_vec, t1) /
815 (1 - gsl_vector_get(prior_vec, t1)) * vec_wab[ij] * BF[b][m][ij];
816 }
817 }
818 logm += log(d) + s;
819 t1++;
820 }
821
822 if (it > 0) {
823 dif = logm - logm_save;
824 }
825 logm_save = logm;
826 it++;
827
828 cout << "iteration = " << it << "; marginal likelihood = " << logm << endl;
829 }
830
831 // Update h and rho that correspond to w_ab.
832 for (size_t ij = 0; ij < n_grid; ij++) {
833 sigma_a2 = vec_sa2[ij];
834 sigma_b2 = vec_sb2[ij];
835
836 d = exp(gsl_vector_get(coef, coef->size - 1)) /
837 (1 + exp(gsl_vector_get(coef, coef->size - 1)));
838 h = (d * (double)ns_test * sigma_a2 + 1 * sigma_b2) /
839 (1 + d * (double)ns_test * sigma_a2 + 1 * sigma_b2);
840 rho = d * (double)ns_test * sigma_a2 /
841 (d * (double)ns_test * sigma_a2 + 1 * sigma_b2);
842
843 gsl_matrix_set(Hyper, ij, 0, h);
844 gsl_matrix_set(Hyper, ij, 1, rho);
845 gsl_matrix_set(Hyper, ij, 2, sigma_a2);
846 gsl_matrix_set(Hyper, ij, 3, sigma_b2);
847 gsl_matrix_set(Hyper, ij, 4, vec_wab_new[ij]);
848 }
849
850 // Obtain beta and alpha parameters.
851
852 // Save results.
853 WriteResult(vec_rs, Hyper, pip, coef);
854
855 // Free matrices and vectors.
856 gsl_vector_free(prior_vec);
857 gsl_matrix_free(Hyper);
858 gsl_vector_free(pip);
859 gsl_vector_free(coef);
860 return;
861 }
862