1 /* The MIT License
2
3 Copyright (C) 2015-2021 Giulio Genovese
4
5 Author: Giulio Genovese <giulio.genovese@gmail.com>
6
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 THE SOFTWARE.
24
25 */
26
27 #include <stdio.h>
28 #include <unistd.h>
29 #include <getopt.h>
30 #include <errno.h>
31 #include <ctype.h>
32 #include <math.h>
33 #include <htslib/kseq.h>
34 #include <htslib/vcf.h>
35 #include <htslib/synced_bcf_reader.h>
36 #include <htslib/ksort.h>
37 #include "regidx.h"
38 #include "kmin.h"
39 #include "mocha.h"
40 #include "genome_rules.h"
41 #include "beta_binom.h"
42 #include "bcftools.h"
43 #include "filter.h"
44 #include "tsv2vcf.h"
45
46 #define MOCHA_VERSION "2021-10-15"
47
48 /****************************************
49 * CONSTANT DEFINITIONS *
50 ****************************************/
51
52 #define SIGN(x) (((x) > 0) - ((x) < 0))
53
54 #define BDEV_LRR_BAF_DFLT "-2.0,-4.0,-6.0,10.0,6.0,4.0"
55 #define BDEV_BAF_PHASE_DFLT "6.0,8.0,10.0,15.0,20.0,30.0,50.0,80.0,100.0,150.0,200.0"
56 #define MIN_DST_DFLT "400"
57 #define ADJ_BAF_LRR_DFLT "5"
58 #define REGRESS_BAF_LRR_DFLT "15"
59 #define LRR_GC_ORDER_DFLT "2"
60 #define MAX_ORDER 5
61 #define XY_MAJOR_PL_DFLT "65.0"
62 #define XY_MINOR_PL_DFLT "35.0"
63 #define AUTO_TEL_PL_DFLT "20.0"
64 #define CHRX_TEL_PL_DFLT "8.0"
65 #define CHRY_TEL_PL_DFLT "6.0"
66 #define ERR_PL_DFLT "15.0"
67 #define FLIP_PL_DFLT "20.0"
68 #define SHORT_ARM_CHRS_DFLT "13,14,15,21,22,chr13,chr14,chr15,chr21,chr22"
69 #define LRR_BIAS_DFLT "0.2"
70 // https://www.illumina.com/documents/products/technotes/technote_cnv_algorithms.pdf
71 #define LRR_HAP2DIP_DFLT "0.45"
72
73 #define FLT_INCLUDE (1 << 0)
74 #define FLT_EXCLUDE (1 << 1)
75 #define WGS_DATA (1 << 2)
76 #define NO_LOG (1 << 3)
77 #define NO_ANNOT (1 << 4)
78 #define USE_SHORT_ARMS (1 << 5)
79 #define USE_CENTROMERES (1 << 6)
80 #define USE_NO_RULES_CHRS (1 << 7)
81
82 #define LRR 0
83 #define BAF 1
84 #define AD0 0
85 #define AD1 1
86 #define LDEV 0
87 #define BDEV 1
88
89 #define LRR_BAF 0
90 #define BAF_PHASE 1
91
92 #define MOCHA_UNDET 0
93 #define MOCHA_LOSS 1
94 #define MOCHA_GAIN 2
95 #define MOCHA_CNLOH 3
96 #define MOCHA_CNP_LOSS 4
97 #define MOCHA_CNP_GAIN 5
98 #define MOCHA_CNP_CNV 6
99
100 #define MOCHA_NOT 0
101 #define MOCHA_ARM 1
102 #define MOCHA_TEL 2
103
104 #define GT_NC 0
105 #define GT_AA 1
106 #define GT_AB 2
107 #define GT_BB 3
108
109 /****************************************
110 * DATA STRUCTURES *
111 ****************************************/
112
113 typedef struct {
114 int pos;
115 int allele_a;
116 int allele_b;
117 float adjust[3][3]; // shift
118 } locus_t;
119
120 typedef struct {
121 int allele_a_id, allele_b_id, gc_id, gt_id, ad_id, baf_id, lrr_id;
122 float xy_major_log_prb;
123 float xy_minor_log_prb;
124 float auto_tel_log_prb;
125 float chrX_tel_log_prb;
126 float chrY_tel_log_prb;
127 float err_log_prb;
128 float flip_log_prb;
129 float *bdev_lrr_baf, *bdev_baf_phase;
130 int bdev_lrr_baf_n, bdev_baf_phase_n;
131 int min_dst;
132 float lrr_cutoff;
133 float lrr_hap2dip;
134 float lrr_bias;
135 int adj_baf_lrr;
136 int regress_baf_lrr;
137 int lrr_gc_order;
138 int flags;
139 int filter_logic;
140 filter_t *filter;
141 genome_rules_t *genome_rules;
142 regidx_t *cnp_idx;
143 regitr_t *cnp_itr;
144 regidx_t *mhc_idx;
145 regidx_t *kir_idx;
146
147 int rid;
148 int n;
149 locus_t *locus_arr;
150 int m_locus;
151 float *gc_arr;
152 int m_gc;
153 int n_flipped;
154 } model_t;
155
156 typedef struct {
157 int sample_idx;
158 int computed_gender;
159 int rid;
160 int beg_pos;
161 int end_pos;
162 int length;
163 int8_t p_arm;
164 int8_t q_arm;
165 int n_sites;
166 int n_hets;
167 int n50_hets;
168 float bdev;
169 float bdev_se;
170 float ldev;
171 float ldev_se;
172 float lod_lrr_baf;
173 float lod_baf_phase;
174 int n_flips;
175 float baf_conc;
176 float lod_baf_conc;
177 int8_t type;
178 float cf;
179 } mocha_t;
180
181 typedef struct {
182 int n, m;
183 mocha_t *a;
184 } mocha_table_t;
185
186 typedef struct {
187 float call_rate;
188 float lrr_median;
189 float lrr_sd;
190 float lrr_auto;
191 float dispersion; // either rho(AD0, AD1) for WGS model or sd(BAF)
192 float baf_conc;
193 float baf_auto;
194 float lrr_gc_rel_ess;
195 float coeffs[MAX_ORDER + 1];
196 } stats_t;
197
198 typedef struct {
199 int idx;
200 int computed_gender;
201 float adjlrr_sd;
202 int n_sites;
203 int n_missing_gts;
204 int n_hets;
205 int x_nonpar_n_hets;
206 int par1_n_hets;
207 int xtr_n_hets;
208 int par2_n_hets;
209 float x_nonpar_dispersion; // either rho(AD0, AD1) for WGS model or sd(BAF)
210 float x_nonpar_lrr_median;
211 float y_nonpar_lrr_median;
212 float mt_lrr_median;
213 stats_t stats;
214 stats_t *stats_arr;
215 int m_stats, n_stats;
216
217 int n;
218 int *vcf_imap_arr;
219 int m_vcf_imap;
220 int16_t *data_arr[2];
221 int m_data[2];
222 int8_t *phase_arr;
223 int m_phase;
224 } sample_t;
225
226 /****************************************
227 * INLINE FUNCTIONS AND CONSTANTS *
228 ****************************************/
229
230 // this macro from ksort.h defines the function
231 // void ks_introsort_int(size_t n, int a[]);
KSORT_INIT_GENERIC(int)232 KSORT_INIT_GENERIC(int)
233
234 static inline float sqf(float x) { return x * x; }
sq(double x)235 static inline double sq(double x) { return x * x; }
236 // the x == y is necessary in case x == -INFINITY
log_mean_expf(float x,float y)237 static inline float log_mean_expf(float x, float y) {
238 return x == y ? x : (x > y ? x + logf(1.0f + expf(y - x)) : y + logf(1.0f + expf(x - y))) - (float)M_LN2;
239 }
240
241 static beta_binom_t *beta_binom_null, *beta_binom_alt;
beta_binom_log_lkl(const beta_binom_t * self,int16_t ad0,int16_t ad1)242 static inline float beta_binom_log_lkl(const beta_binom_t *self, int16_t ad0, int16_t ad1) {
243 return ad0 == bcf_int16_missing || ad1 == bcf_int16_missing ? 0.0f : beta_binom_log_unsafe(self, ad0, ad1);
244 }
245
246 /****************************************
247 * CONVERT FLOAT TO INT16 AND VICEVERSA *
248 ****************************************/
249
250 #define INT16_SCALE 1000 // BAF values from Illumina are scaled to 1000
251
float_to_int16(float in)252 static inline int16_t float_to_int16(float in) {
253 return isnan(in) ? bcf_int16_missing : (int16_t)roundf(INT16_SCALE * in);
254 }
255
int16_to_float(int16_t in)256 static inline float int16_to_float(int16_t in) { return in == bcf_int16_missing ? NAN : ((float)in) / INT16_SCALE; }
257
258 /******************************************
259 * LRR AND COVERAGE POLYNOMIAL REGRESSION *
260 ******************************************/
261
262 // https://stackoverflow.com/questions/5083465/fast-efficient-least-squares-fit-algorithm-in-c
263 // https://github.com/natedomin/polyfit/blob/master/polyfit.c
264 // https://software.intel.com/sites/products/documentation/doclib/mkl_sa/11/mkl_lapack_examples/sgels_ex.c.htm
265 // this function needs to use doubles internally when dealing with WGS data
polyfit(const float * lrr,const float * gc,int n,const int * imap,int order,float * coeffs)266 static int polyfit(const float *lrr, const float *gc, int n, const int *imap, int order, float *coeffs) {
267 int m = order + 1;
268 if (n < m || order > MAX_ORDER) return -1;
269 double B[MAX_ORDER + 1] = {0.0};
270 double P[((MAX_ORDER + 1) * 2) + 1] = {0.0};
271 double A[(MAX_ORDER + 1) * 2 * (MAX_ORDER + 1)] = {0.0};
272
273 // identify the column vector
274 for (int i = 0; i < n; i++) {
275 float x = imap ? gc[imap[i]] : gc[i];
276 float y = lrr[i];
277 if (isnan(x) || isnan(y)) continue;
278 float powx = 1.0f;
279
280 for (int j = 0; j < m; j++) {
281 B[j] += (double)(y * powx);
282 powx *= x;
283 }
284 }
285
286 // initialize the PowX array
287 P[0] = (float)n;
288
289 // compute the sum of the powers of X
290 for (int i = 0; i < n; i++) {
291 float x = imap ? gc[imap[i]] : gc[i];
292 if (isnan(x)) continue;
293 float powx = x;
294
295 for (int j = 1; j < ((2 * m) + 1); j++) {
296 P[j] += (double)powx;
297 powx *= x;
298 }
299 }
300
301 // initialize the reduction matrix
302 for (int i = 0; i < m; i++) {
303 for (int j = 0; j < m; j++) {
304 A[(i * (2 * m)) + j] = P[i + j];
305 }
306
307 A[(i * (2 * m)) + (i + m)] = 1.0;
308 }
309
310 // move the identity matrix portion of the redux matrix to the
311 // left side (find the inverse of the left side of the redux matrix)
312 for (int i = 0; i < m; i++) {
313 double x = A[(i * (2 * m)) + i];
314 if (x != 0) {
315 for (int k = 0; k < (2 * m); k++) {
316 A[(i * (2 * m)) + k] /= x;
317 }
318
319 for (int j = 0; j < m; j++) {
320 if (i != j) {
321 double y = A[(j * (2 * m)) + i];
322 for (int k = 0; k < (2 * m); k++) {
323 A[(j * (2 * m)) + k] -= y * A[(i * (2 * m)) + k];
324 }
325 }
326 }
327 } else {
328 // cannot work with singular matrices
329 return -1;
330 }
331 }
332
333 // calculate coefficients
334 for (int i = 0; i < m; i++) {
335 for (int j = 0; j < m; j++) {
336 double x = 0.0;
337 for (int k = 0; k < m; k++) {
338 x += (A[(i * (2 * m)) + (k + m)] * B[k]);
339 }
340 coeffs[i] = (float)x;
341 }
342 }
343
344 return 0;
345 }
346
ad_to_lrr_baf(const int16_t * ad0,const int16_t * ad1,float * lrr,float * baf,int n)347 static void ad_to_lrr_baf(const int16_t *ad0, const int16_t *ad1, float *lrr, float *baf, int n) {
348 // this function keeps a list of logarithms of integers to minimize log calls
349 static float *logf_arr = NULL;
350 static int n_logf = 0, m_logf = 0;
351 if (ad0 == NULL && ad1 == NULL && n == 0) {
352 free(logf_arr);
353 return;
354 }
355
356 for (int i = 0; i < n; i++) {
357 if (ad0[i] == bcf_int16_missing && ad1[i] == bcf_int16_missing) {
358 lrr[i] = NAN;
359 baf[i] = NAN;
360 continue;
361 }
362 int cov = (int)(ad0[i] == bcf_int16_missing ? 0 : ad0[i]) + (int)(ad1[i] == bcf_int16_missing ? 0 : ad1[i]);
363 if (cov == 0) {
364 lrr[i] = 0;
365 baf[i] = NAN;
366 } else {
367 if (cov > n_logf) {
368 hts_expand(float, cov, m_logf, logf_arr);
369 for (int j = n_logf; j < cov; j++) logf_arr[j] = logf(j + 1);
370 n_logf = cov;
371 }
372 lrr[i] = logf_arr[cov - 1];
373 baf[i] = (ad0[i] == bcf_int16_missing || ad1[i] == bcf_int16_missing) ? NAN : (float)ad1[i] / (float)cov;
374 }
375 }
376 }
377
adjust_lrr(float * lrr,const float * gc,int n,const int * imap,const float * coeffs,int order)378 static void adjust_lrr(float *lrr, const float *gc, int n, const int *imap, const float *coeffs, int order) {
379 for (int i = 0; i < n; i++) {
380 float x = imap ? gc[imap[i]] : gc[i];
381 float powx = 1.0f;
382 for (int j = 0; j <= order; j++) {
383 lrr[i] -= coeffs[j] * powx;
384 powx *= x;
385 }
386 }
387 }
388
389 // computes total sum of squares
390 // this function needs to use doubles internally when dealing with WGS data
get_tss(const float * v,int n)391 static float get_tss(const float *v, int n) {
392 double mean = 0.0;
393 int j = 0;
394 for (int i = 0; i < n; i++) {
395 if (!isnan(v[i])) {
396 mean += (double)v[i];
397 j++;
398 }
399 }
400 if (j <= 1) return NAN;
401 mean /= (double)j;
402
403 double tss = 0.0;
404 for (int i = 0; i < n; i++) {
405 if (!isnan(v[i])) tss += sq((double)v[i] - mean);
406 }
407 return (float)tss;
408 }
409
410 /*********************************
411 * HMM AND OPTIMIZATION METHODS *
412 *********************************/
413
414 // compute Viterbi path from probabilities
retrace_viterbi(int T,int N,const float * log_prb,const int8_t * ptr)415 static int8_t *retrace_viterbi(int T, int N, const float *log_prb, const int8_t *ptr) {
416 int i, t;
417 int8_t *path = (int8_t *)malloc(T * sizeof(int8_t));
418
419 // initialize last path state
420 path[T - 1] = 0;
421 for (i = 1; i < N; i++)
422 if (log_prb[(int)path[T - 1]] < log_prb[i]) path[T - 1] = (int8_t)i;
423
424 // compute best path by tracing back the Markov chain
425 for (t = T - 1; t > 0; t--) path[t - 1] = ptr[(t - 1) * N + (int)path[t]];
426
427 return path;
428 }
429
430 // rescale Viterbi log probabilities to avoid underflow issues
rescale_log_prb(float * log_prb,int n)431 static inline void rescale_log_prb(float *log_prb, int n) {
432 float max = -INFINITY;
433 for (int i = 0; i < n; i++) max = max > log_prb[i] ? max : log_prb[i];
434 for (int i = 0; i < n; i++) log_prb[i] -= max;
435 }
436
437 // compute the Viterbi path from BAF
438 // n is the length of the hidden Markov model
439 // m is the number of possible BAF deviations
log_viterbi_run(const float * emis_log_lkl,int T,int m,float xy_major_log_prb,float xy_minor_log_prb,float tel_log_prb,float flip_log_prb,int last_p,int first_q)440 static int8_t *log_viterbi_run(const float *emis_log_lkl, int T, int m, float xy_major_log_prb, float xy_minor_log_prb,
441 float tel_log_prb, float flip_log_prb, int last_p, int first_q) {
442 int t, i, j, changeidx;
443
444 // determine the number of hidden states based on whether phase information is used
445 int N = 1 + m + (isnan(flip_log_prb) ? 0 : m);
446
447 // allocate memory necessary for running the algorithm
448 float *log_prb = (float *)malloc(N * sizeof(float));
449 float *new_log_prb = (float *)malloc(N * sizeof(float));
450 int8_t *ptr = (int8_t *)malloc(N * (T - 1) * sizeof(int8_t));
451 int8_t *path;
452
453 // initialize and rescale the first state
454 log_prb[0] = emis_log_lkl[0];
455 for (i = 1; i < N; i++) log_prb[i] = tel_log_prb + emis_log_lkl[i];
456 rescale_log_prb(log_prb, N);
457
458 // compute best probabilities at each position
459 for (t = 1; t < T; t++) {
460 // this causes a penalty for mosaic chromosomal calls across the centromeres
461 float exit_log_prb = t > last_p ? xy_major_log_prb : xy_minor_log_prb;
462 float enter_log_prb = t < first_q ? xy_major_log_prb : xy_minor_log_prb;
463
464 for (i = 0; i < N; i++) {
465 new_log_prb[i] = log_prb[i];
466 ptr[(t - 1) * N + i] = (int8_t)i;
467 }
468
469 // compute whether a state switch should be considered for null state
470 for (i = 1; i < N; i++) {
471 if (new_log_prb[0] < log_prb[i] + exit_log_prb) {
472 new_log_prb[0] = log_prb[i] + exit_log_prb;
473 ptr[(t - 1) * N] = ptr[(t - 1) * N + i];
474 }
475 if (new_log_prb[i] < log_prb[0] + enter_log_prb) {
476 new_log_prb[i] = log_prb[0] + enter_log_prb;
477 ptr[(t - 1) * N + i] = ptr[(t - 1) * N];
478 }
479 }
480
481 // compute whether a state switch should be considered for each other state
482 // it will run twice if and only if phasing is used
483 for (j = 0; j == 0 || (!isnan(flip_log_prb) && j == m); j += m) {
484 float change_log_prb = log_prb[0] + enter_log_prb;
485 changeidx = 0;
486 for (i = 0; i < m; i++) {
487 if (change_log_prb < log_prb[1 + j + i] + (xy_major_log_prb + xy_minor_log_prb) * 0.75f) {
488 change_log_prb = log_prb[1 + j + i] + (xy_major_log_prb + xy_minor_log_prb) * 0.75f;
489 changeidx = 1 + j + i;
490 }
491 }
492 for (i = 0; i < m; i++) {
493 if (new_log_prb[1 + j + i] < change_log_prb) {
494 new_log_prb[1 + j + i] = change_log_prb;
495 ptr[(t - 1) * N + 1 + j + i] = ptr[(t - 1) * N + changeidx];
496 }
497 }
498 }
499
500 // compute whether a phase flip should be considered for non-null states
501 if (!isnan(flip_log_prb)) {
502 for (i = 0; i < m; i++) {
503 if (new_log_prb[1 + i] < new_log_prb[1 + m + i] + flip_log_prb) {
504 new_log_prb[1 + i] = new_log_prb[1 + m + i] + flip_log_prb;
505 ptr[(t - 1) * N + 1 + i] = ptr[(t - 1) * N + 1 + m + i];
506 }
507 if (new_log_prb[1 + m + i] < new_log_prb[1 + i] + flip_log_prb) {
508 new_log_prb[1 + m + i] = new_log_prb[1 + i] + flip_log_prb;
509 ptr[(t - 1) * N + 1 + m + i] = ptr[(t - 1) * N + 1 + i];
510 }
511 }
512 }
513
514 // update and rescale the current state
515 new_log_prb[0] += emis_log_lkl[t * N];
516 for (i = 0; i < m; i++) {
517 new_log_prb[1 + i] += emis_log_lkl[t * N + 1 + i];
518 if (!isnan(flip_log_prb)) new_log_prb[1 + m + i] += emis_log_lkl[t * N + 1 + m + i];
519 }
520 for (i = 0; i < N; i++) log_prb[i] = new_log_prb[i];
521 rescale_log_prb(log_prb, N);
522 }
523
524 // add closing cost to the last state
525 for (i = 1; i < N; i++) log_prb[i] += tel_log_prb;
526 rescale_log_prb(log_prb, N);
527
528 path = retrace_viterbi(T, N, log_prb, ptr);
529
530 // free memory
531 free(log_prb);
532 free(new_log_prb);
533 free(ptr);
534
535 // symmetrize the path
536 if (!isnan(flip_log_prb))
537 for (i = 0; i < T; i++)
538 if (path[i] > m) path[i] = (int8_t)m - path[i];
539
540 return path;
541 }
542
543 /*********************************
544 * LRR AND BAF LIKELIHOODS *
545 *********************************/
546
547 // rescale emission probabilities to avoid problems with outliers
548 // TODO find a better name for this function
rescale_emis_log_lkl(float * log_prb,int n,float err_log_prb)549 static void rescale_emis_log_lkl(float *log_prb, int n, float err_log_prb) {
550 float min_thr = -INFINITY;
551 for (int i = 0; i < n; i++)
552 if (min_thr < log_prb[i]) min_thr = log_prb[i];
553 min_thr += err_log_prb;
554 for (int i = 0; i < n; i++)
555 if (log_prb[i] < min_thr) log_prb[i] = min_thr;
556 }
557
norm_log_lkl(float x,float m,float s,float w)558 static inline float norm_log_lkl(float x, float m, float s, float w) {
559 return isnan(x) ? 0.0f : -0.5f * sqf((x - m) / s) * w;
560 }
561
562 // lrr_bias is used in a different way from what done by Petr Danecek in bcftools/vcfcnv.c
lrr_baf_log_lkl(float lrr,float baf,float ldev,float bdev,float lrr_sd,float baf_sd,float lrr_bias)563 static inline float lrr_baf_log_lkl(float lrr, float baf, float ldev, float bdev, float lrr_sd, float baf_sd,
564 float lrr_bias) {
565 return norm_log_lkl(lrr, ldev, lrr_sd, lrr_bias)
566 + log_mean_expf(norm_log_lkl(baf - 0.5f, bdev, baf_sd, 1.0f), norm_log_lkl(baf - 0.5f, -bdev, baf_sd, 1.0f));
567 }
568
baf_phase_log_lkl(float baf,int8_t phase,float bdev,float baf_sd)569 static inline float baf_phase_log_lkl(float baf, int8_t phase, float bdev, float baf_sd) {
570 return phase == 0 ? log_mean_expf(norm_log_lkl(baf - 0.5f, bdev, baf_sd, 1.0f),
571 norm_log_lkl(baf - 0.5f, -bdev, baf_sd, 1.0f))
572 : norm_log_lkl(baf - 0.5f, (float)SIGN(phase) * bdev, baf_sd, 1.0f);
573 }
574
575 // precomupute emission probabilities
lrr_baf_emis_log_lkl(const float * lrr,const float * baf,int T,const int * imap,float err_log_prb,float lrr_bias,float lrr_hap2dip,float lrr_sd,float baf_sd,const float * bdev_lrr_baf_arr,int m)576 static float *lrr_baf_emis_log_lkl(const float *lrr, const float *baf, int T, const int *imap, float err_log_prb,
577 float lrr_bias, float lrr_hap2dip, float lrr_sd, float baf_sd,
578 const float *bdev_lrr_baf_arr, int m) {
579 float *ldev = (float *)malloc(m * sizeof(float));
580 for (int i = 0; i < m; i++) ldev[i] = -logf(1.0f - 2.0f * bdev_lrr_baf_arr[i]) / (float)M_LN2 * lrr_hap2dip;
581 int N = 1 + 2 * m;
582 float *emis_log_lkl = (float *)malloc(N * T * sizeof(float));
583 for (int t = 0; t < T; t++) {
584 float x = imap ? lrr[imap[t]] : lrr[t];
585 float y = imap ? baf[imap[t]] : baf[t];
586 emis_log_lkl[t * N] = lrr_baf_log_lkl(x, y, 0.0f, 0.0f, lrr_sd, baf_sd, lrr_bias);
587 for (int i = 0; i < m; i++) {
588 emis_log_lkl[t * N + 1 + i] = lrr_baf_log_lkl(x, y, ldev[i], bdev_lrr_baf_arr[i], lrr_sd, baf_sd, lrr_bias);
589 }
590 // add states to distinguish LRR waves from true mosaic gains/losses
591 for (int i = 0; i < m; i++) {
592 if (bdev_lrr_baf_arr[i] < -1.0f / 6.0f || bdev_lrr_baf_arr[i] >= 1.0f / 6.0f)
593 emis_log_lkl[t * N + 1 + m + i] = emis_log_lkl[t * N] + err_log_prb;
594 else
595 emis_log_lkl[t * N + 1 + m + i] =
596 lrr_baf_log_lkl(x, y, bdev_lrr_baf_arr[i], 0.0f, lrr_sd, baf_sd, lrr_bias);
597 }
598 rescale_emis_log_lkl(&emis_log_lkl[t * N], N, err_log_prb);
599 }
600 free(ldev);
601 return emis_log_lkl;
602 }
603
604 // precomupute emission probabilities
baf_phase_emis_log_lkl(const float * baf,const int8_t * gt_phase,int T,const int * imap,float err_log_prb,float baf_sd,const float * bdev,int m)605 static float *baf_phase_emis_log_lkl(const float *baf, const int8_t *gt_phase, int T, const int *imap,
606 float err_log_prb, float baf_sd, const float *bdev, int m) {
607 int N = 1 + 2 * m;
608 float *emis_log_lkl = (float *)malloc(N * T * sizeof(float));
609 for (int t = 0; t < T; t++) {
610 float x = imap ? baf[imap[t]] : baf[t];
611 int8_t p = imap ? gt_phase[imap[t]] : gt_phase[t];
612 emis_log_lkl[t * N] = baf_phase_log_lkl(x, (int8_t)1, 0.0f, baf_sd);
613 for (int i = 0; i < m; i++) {
614 emis_log_lkl[t * N + 1 + i] = baf_phase_log_lkl(x, p, bdev[i], baf_sd);
615 if (p == 0)
616 emis_log_lkl[t * N + 1 + m + i] = emis_log_lkl[t * N + 1 + i];
617 else
618 emis_log_lkl[t * N + 1 + m + i] = baf_phase_log_lkl(x, p, -bdev[i], baf_sd);
619 }
620 rescale_emis_log_lkl(&emis_log_lkl[t * N], N, err_log_prb);
621 }
622 return emis_log_lkl;
623 }
624
cnp_edge_is_not_cn2_lrr_baf(const float * lrr,const float * baf,int n,int a,int b,float xy_log_prb,float err_log_prb,float lrr_bias,float lrr_hap2dip,float lrr_sd,float baf_sd,float ldev,float bdev)625 static int cnp_edge_is_not_cn2_lrr_baf(const float *lrr, const float *baf, int n, int a, int b, float xy_log_prb,
626 float err_log_prb, float lrr_bias, float lrr_hap2dip, float lrr_sd, float baf_sd,
627 float ldev, float bdev) {
628 // test left edge
629 float sum_log_lkl = 0.0f;
630 for (int i = a - 1; i >= 0; i--) {
631 float log_lkl = lrr_baf_log_lkl(lrr[i], baf[i], ldev, bdev, lrr_sd, baf_sd, lrr_bias)
632 - lrr_baf_log_lkl(lrr[i], baf[i], 0.0f, 0.0f, lrr_sd, baf_sd, lrr_bias);
633 if (!isnan(err_log_prb)) {
634 if (log_lkl < err_log_prb)
635 log_lkl = err_log_prb;
636 else if (log_lkl > -err_log_prb)
637 log_lkl = -err_log_prb;
638 }
639 sum_log_lkl += log_lkl;
640 if (sum_log_lkl > -xy_log_prb) return -1;
641 if (sum_log_lkl < xy_log_prb) break;
642 }
643
644 // test right edge
645 sum_log_lkl = 0.0f;
646 for (int i = b + 1; i < n; i++) {
647 float log_lkl = lrr_baf_log_lkl(lrr[i], baf[i], ldev, bdev, lrr_sd, baf_sd, lrr_bias)
648 - lrr_baf_log_lkl(lrr[i], baf[i], 0, 0, lrr_sd, baf_sd, lrr_bias);
649 if (!isnan(err_log_prb)) {
650 if (log_lkl < err_log_prb)
651 log_lkl = err_log_prb;
652 else if (log_lkl > -err_log_prb)
653 log_lkl = -err_log_prb;
654 }
655 sum_log_lkl += log_lkl;
656 if (sum_log_lkl > -xy_log_prb) return -1;
657 if (sum_log_lkl < xy_log_prb) break;
658 }
659
660 return 0;
661 }
662
663 // return the LOD likelihood for a segment
lrr_baf_lod(const float * lrr_arr,const float * baf_arr,int n,const int * imap,float err_log_prb,float lrr_bias,float lrr_hap2dip,float lrr_sd,float baf_sd,double bdev_lrr_baf)664 static double lrr_baf_lod(const float *lrr_arr, const float *baf_arr, int n, const int *imap, float err_log_prb,
665 float lrr_bias, float lrr_hap2dip, float lrr_sd, float baf_sd, double bdev_lrr_baf) {
666 if (n == 0 || bdev_lrr_baf < -0.5 || bdev_lrr_baf > 0.25) return -INFINITY; // kmin_brent does not handle NAN
667
668 float ldev = -logf(1.0f - 2.0f * (float)bdev_lrr_baf) / (float)M_LN2 * lrr_hap2dip;
669 float ret = 0.0f;
670 for (int i = 0; i < n; i++) {
671 float lrr = imap ? lrr_arr[imap[i]] : lrr_arr[i];
672 float baf = imap ? baf_arr[imap[i]] : baf_arr[i];
673 float log_lkl = lrr_baf_log_lkl(lrr, baf, ldev, (float)bdev_lrr_baf, lrr_sd, baf_sd, lrr_bias)
674 - lrr_baf_log_lkl(lrr, baf, 0.0f, 0.0f, lrr_sd, baf_sd, lrr_bias);
675 if (!isnan(err_log_prb)) {
676 if (log_lkl < err_log_prb)
677 log_lkl = err_log_prb;
678 else if (log_lkl > -err_log_prb)
679 log_lkl = -err_log_prb;
680 }
681 ret += log_lkl;
682 }
683 return (double)ret * M_LOG10E;
684 }
685
686 // return the LOD likelihood for a segment
baf_phase_lod(const float * baf_arr,const int8_t * gt_phase,int n,const int * imap,const int8_t * as,float err_log_prb,float baf_sd,double bdev)687 static double baf_phase_lod(const float *baf_arr, const int8_t *gt_phase, int n, const int *imap, const int8_t *as,
688 float err_log_prb, float baf_sd, double bdev) {
689 if (n == 0 || bdev < 0.0 || bdev > 0.5) return -INFINITY; // kmin_brent does not handle NAN
690
691 float ret = 0.0f;
692 for (int i = 0; i < n; i++) {
693 float baf = imap ? baf_arr[imap[i]] : baf_arr[i];
694 int8_t p = imap ? gt_phase[imap[i]] : gt_phase[i];
695 if (as) p *= (int8_t)SIGN(as[i]); // notice as has no imap
696 float log_lkl = baf_phase_log_lkl(baf, p, (float)bdev, baf_sd) - baf_phase_log_lkl(baf, 0, 0.0f, baf_sd);
697 if (!isnan(err_log_prb)) {
698 if (log_lkl < err_log_prb)
699 log_lkl = err_log_prb;
700 else if (log_lkl > -err_log_prb)
701 log_lkl = -err_log_prb;
702 }
703 ret += log_lkl;
704 }
705 return (double)ret * M_LOG10E;
706 }
707
708 typedef struct
709 {
710 const float *baf;
711 const int8_t *gt_phase;
712 int n;
713 const int *imap;
714 int8_t *path;
715 float err_log_prb;
716 float baf_sd;
717 } f1_data_t;
718
f1(double x,void * data)719 double f1(double x, void *data)
720 {
721 f1_data_t *d = data;
722
723 return -baf_phase_lod(d->baf, d->gt_phase, d->n, d->imap, d->path,
724 d->err_log_prb, d->baf_sd, x);
725 }
726
f1_pack(const float * baf,const int8_t * gt_phase,int n,const int * imap,int8_t * path,float err_log_prb,float baf_sd)727 static inline f1_data_t *f1_pack(const float *baf, const int8_t *gt_phase, int n,
728 const int *imap, int8_t *path, float err_log_prb,
729 float baf_sd)
730 {
731 // Switch to malloc() and free() if more than one object must exist at
732 // any given time
733 f1_data_t *d = malloc(sizeof(f1_data_t));
734
735 d->baf = baf;
736 d->gt_phase = gt_phase;
737 d->n = n;
738 d->imap = imap;
739 d->path = path;
740 d->err_log_prb = err_log_prb;
741 d->baf_sd = baf_sd;
742
743 return d;
744 }
745
746 // TODO find a better title for this function
compare_models(const float * baf,const int8_t * gt_phase,int n,const int * imap,float xy_log_prb,float err_log_prb,float flip_log_prb,float tel_log_prb,float baf_sd,const float * bdev,int m)747 static float compare_models(const float *baf, const int8_t *gt_phase, int n, const int *imap, float xy_log_prb,
748 float err_log_prb, float flip_log_prb, float tel_log_prb, float baf_sd, const float *bdev,
749 int m) {
750 if (n == 0) return NAN;
751 float *emis_log_lkl = baf_phase_emis_log_lkl(baf, gt_phase, n, imap, err_log_prb, baf_sd, bdev, m);
752 int8_t *path = log_viterbi_run(emis_log_lkl, n, m, xy_log_prb, xy_log_prb, tel_log_prb, flip_log_prb, 0, 0);
753 free(emis_log_lkl);
754 int n_flips = 0;
755 for (int i = 1; i < n; i++)
756 if (path[i - 1] && path[i] && path[i - 1] != path[i]) n_flips++;
757 double x;
758 f1_data_t *f1_data = f1_pack(baf, gt_phase, n, imap, path, err_log_prb,
759 baf_sd);
760 double fx = kmin_brent(f1, 0.1, 0.2, f1_data, KMIN_EPS, &x);
761 free(f1_data);
762 free(path);
763 return -(float)fx + (float)n_flips * flip_log_prb * (float)M_LOG10E;
764 }
765
baf_log_lkl(const float * baf_arr,int n,const int * imap,float baf_sd,double bdev)766 static double baf_log_lkl(const float *baf_arr, int n, const int *imap, float baf_sd, double bdev) {
767 if (n == 0 || bdev < 0.0 || bdev > 0.5) return -INFINITY; // kmin_brent does not handle NAN
768
769 double ret = 0.0;
770 for (int i = 0; i < n; i++) {
771 float baf = imap ? baf_arr[imap[i]] : baf_arr[i];
772 if (isnan(baf)) continue;
773 float log_lkl = log_mean_expf(norm_log_lkl(baf - 0.5f, (float)bdev, baf_sd, 1.0f),
774 norm_log_lkl(baf - 0.5f, -(float)bdev, baf_sd, 1.0f));
775 ret += (double)log_lkl;
776 }
777 return ret * M_LOG10E;
778 }
779
get_sample_mean(const float * v,int n,const int * imap)780 static float get_sample_mean(const float *v, int n, const int *imap) {
781 float mean = 0.0;
782 int j = 0;
783 for (int i = 0; i < n; i++) {
784 float tmp = imap ? v[imap[i]] : v[i];
785 if (!isnan(tmp)) {
786 mean += tmp;
787 j++;
788 }
789 }
790 if (j <= 1) return NAN;
791 return mean /= (float)j;
792 }
793
794 typedef struct
795 {
796 const float *baf_arr;
797 int n;
798 const int *imap;
799 float baf_sd;
800 } f7_data_t;
801
f7(double x,void * data)802 double f7(double x, void *data)
803 {
804 f7_data_t *d = data;
805
806 return -baf_log_lkl(d->baf_arr, d->n, d->imap, d->baf_sd, x);
807 }
808
f7_pack(const float * baf_arr,int n,const int * imap,float baf_sd)809 static inline f7_data_t *f7_pack(const float *baf_arr, int n, const int *imap,
810 float baf_sd)
811 {
812 f7_data_t *d = malloc(sizeof(f7_data_t));
813
814 d->baf_arr = baf_arr;
815 d->n = n;
816 d->imap = imap;
817 d->baf_sd = baf_sd;
818
819 return d;
820 }
821
get_baf_bdev(const float * baf_arr,int n,const int * imap,float baf_sd)822 static float get_baf_bdev(const float *baf_arr, int n, const int *imap, float baf_sd) {
823 double bdev = 0.0;
824 int j = 0;
825 for (int i = 0; i < n; i++) {
826 float baf = imap ? baf_arr[imap[i]] : baf_arr[i];
827 if (isnan(baf)) continue;
828 bdev += fabs((double)baf - 0.5);
829 j++;
830 }
831 if (j == 0) return NAN;
832 bdev /= j;
833 // simple method to compute bdev should work well for germline duplications
834 if ((float)bdev > 2.0f * baf_sd) return (float)bdev;
835 f7_data_t *f7_data = f7_pack(baf_arr, n, imap, baf_sd);
836 kmin_brent(f7, 0.1, 0.2, f7_data, KMIN_EPS, &bdev);
837 free(f7_data);
838 return (float)bdev < 1e-4 ? (float)NAN : (float)bdev;
839 }
840
get_max_sum(const int16_t * ad0,const int16_t * ad1,int n,const int * imap,int * n1,int * n2)841 static void get_max_sum(const int16_t *ad0, const int16_t *ad1, int n, const int *imap, int *n1, int *n2) {
842 *n1 = 0;
843 *n2 = 0;
844 for (int i = 0; i < n; i++) {
845 int a = imap ? ad0[imap[i]] : ad0[i];
846 int b = imap ? ad1[imap[i]] : ad1[i];
847 if (a != bcf_int16_missing && b != bcf_int16_missing) {
848 if (a > *n1) *n1 = a;
849 if (b > *n1) *n1 = b;
850 if (a + b > *n2) *n2 = a + b;
851 }
852 }
853 }
854
ad_log_lkl(const int16_t * ad0_arr,const int16_t * ad1_arr,int n,const int * imap,float ad_rho,double bdev)855 static double ad_log_lkl(const int16_t *ad0_arr, const int16_t *ad1_arr, int n, const int *imap, float ad_rho,
856 double bdev) {
857 if (n == 0 || bdev < 0.0 || bdev > 0.5) return -INFINITY; // kmin_brent does not handle NAN
858
859 int n1, n2;
860 get_max_sum(ad0_arr, ad1_arr, n, imap, &n1, &n2);
861 beta_binom_update(beta_binom_alt, 0.5f + (float)bdev, ad_rho, n1, n2);
862
863 double ret = 0.0;
864 for (int i = 0; i < n; i++) {
865 int16_t ad0 = imap ? ad0_arr[imap[i]] : ad0_arr[i];
866 int16_t ad1 = imap ? ad1_arr[imap[i]] : ad1_arr[i];
867 float log_lkl =
868 log_mean_expf(beta_binom_log_lkl(beta_binom_alt, ad0, ad1), beta_binom_log_lkl(beta_binom_alt, ad1, ad0));
869 ret += (double)log_lkl;
870 }
871 return (double)ret * M_LOG10E;
872 }
873
874 typedef struct
875 {
876 const int16_t *ad0_arr;
877 const int16_t *ad1_arr;
878 int n;
879 const int *imap;
880 float ad_rho;
881 } f5_data_t;
882
f5(double x,void * data)883 double f5(double x, void *data)
884 {
885 f5_data_t *d = data;
886
887 return -ad_log_lkl(d->ad0_arr, d->ad1_arr, d->n, d->imap, d->ad_rho, x);
888 }
889
f5_pack(const int16_t * ad0_arr,const int16_t * ad1_arr,int n,const int * imap,float ad_rho)890 static inline f5_data_t *f5_pack(const int16_t *ad0_arr,
891 const int16_t *ad1_arr, int n, const int *imap, float ad_rho)
892 {
893 f5_data_t *d = malloc(sizeof(f5_data_t));
894
895 d->ad0_arr = ad0_arr;
896 d->ad1_arr = ad1_arr;
897 d->n = n;
898 d->imap = imap;
899 d->ad_rho = ad_rho;
900
901 return d;
902 }
903
get_ad_bdev(const int16_t * ad0_arr,const int16_t * ad1_arr,int n,const int * imap,float ad_rho)904 static float get_ad_bdev(const int16_t *ad0_arr, const int16_t *ad1_arr, int n, const int *imap, float ad_rho) {
905 double bdev = 0.0;
906 f5_data_t *f5_data = f5_pack(ad0_arr, ad1_arr, n, imap, ad_rho);
907 kmin_brent(f5, 0.1, 0.2, f5_data, KMIN_EPS, &bdev);
908 free(f5_data);
909 return (float)bdev < 1e-4 ? (float)NAN : (float)bdev;
910 }
911
912 /*********************************
913 * WGS AD LIKELIHOODS *
914 *********************************/
915
lrr_ad_log_lkl(float lrr,int16_t ad0,int16_t ad1,float ldev,float lrr_sd,float lrr_bias,const beta_binom_t * beta_binom)916 static inline float lrr_ad_log_lkl(float lrr, int16_t ad0, int16_t ad1, float ldev, float lrr_sd, float lrr_bias,
917 const beta_binom_t *beta_binom) {
918 return norm_log_lkl(lrr, ldev, lrr_sd, lrr_bias)
919 + log_mean_expf(beta_binom_log_lkl(beta_binom, ad0, ad1), beta_binom_log_lkl(beta_binom, ad1, ad0));
920 }
921
ad_phase_log_lkl(int16_t ad0,int16_t ad1,int8_t phase,const beta_binom_t * beta_binom)922 static inline float ad_phase_log_lkl(int16_t ad0, int16_t ad1, int8_t phase, const beta_binom_t *beta_binom) {
923 return phase == 0
924 ? log_mean_expf(beta_binom_log_lkl(beta_binom, ad0, ad1), beta_binom_log_lkl(beta_binom, ad1, ad0))
925 : (phase > 0 ? beta_binom_log_lkl(beta_binom, ad0, ad1) : beta_binom_log_lkl(beta_binom, ad1, ad0));
926 }
927
lrr_ad_emis_log_lkl(const float * lrr,const int16_t * ad0,const int16_t * ad1,int T,const int * imap,float err_log_prb,float lrr_bias,float lrr_hap2dip,float lrr_sd,float ad_rho,const float * bdev_lrr_baf_arr,int m)928 static float *lrr_ad_emis_log_lkl(const float *lrr, const int16_t *ad0, const int16_t *ad1, int T, const int *imap,
929 float err_log_prb, float lrr_bias, float lrr_hap2dip, float lrr_sd, float ad_rho,
930 const float *bdev_lrr_baf_arr, int m) {
931 int N = 1 + 2 * m;
932 int n1, n2;
933 get_max_sum(ad0, ad1, T, NULL, &n1, &n2);
934 float *emis_log_lkl = (float *)malloc(N * T * sizeof(float));
935 for (int i = 0; i < 1 + m; i++) {
936 float ldev = i == 0 ? 0.0f : -logf(1.0f - 2.0f * bdev_lrr_baf_arr[i - 1]) / (float)M_LN2 * lrr_hap2dip;
937 float bdev = i == 0 ? 0.0f : fabsf(bdev_lrr_baf_arr[i - 1]);
938 beta_binom_t *beta_binom = i == 0 ? beta_binom_null : beta_binom_alt;
939 beta_binom_update(beta_binom, 0.5f + bdev, ad_rho, n1, n2);
940
941 for (int t = 0; t < T; t++) {
942 float x = imap ? lrr[imap[t]] : lrr[t];
943 int16_t a = imap ? ad0[imap[t]] : ad0[t];
944 int16_t b = imap ? ad1[imap[t]] : ad1[t];
945 emis_log_lkl[t * N + i] = lrr_ad_log_lkl(x, a, b, ldev, lrr_sd, lrr_bias, beta_binom);
946 }
947 }
948 // generate states that should attract LRR waves with no BAF signal
949 for (int i = 0; i < m; i++) {
950 // do not make extreme states compete with LRR waves
951 if (bdev_lrr_baf_arr[i] < -1.0f / 6.0f || bdev_lrr_baf_arr[i] >= 1.0f / 6.0f) {
952 for (int t = 0; t < T; t++) emis_log_lkl[t * N + 1 + m + i] = emis_log_lkl[t * N] + err_log_prb;
953 } else {
954 float ldev = -logf(1.0f - 2.0f * bdev_lrr_baf_arr[i]) / (float)M_LN2 * lrr_hap2dip;
955 for (int t = 0; t < T; t++) {
956 float x = imap ? lrr[imap[t]] : lrr[t];
957 int16_t a = imap ? ad0[imap[t]] : ad0[t];
958 int16_t b = imap ? ad1[imap[t]] : ad1[t];
959 emis_log_lkl[t * N + 1 + m + i] = lrr_ad_log_lkl(x, a, b, ldev, lrr_sd, lrr_bias, beta_binom_null);
960 }
961 }
962 }
963 for (int t = 0; t < T; t++) rescale_emis_log_lkl(&emis_log_lkl[t * N], N, err_log_prb);
964 return emis_log_lkl;
965 }
966
ad_phase_emis_log_lkl(const int16_t * ad0,const int16_t * ad1,const int8_t * gt_phase,int T,const int * imap,float err_log_prb,float ad_rho,const float * bdev_arr,int m)967 static float *ad_phase_emis_log_lkl(const int16_t *ad0, const int16_t *ad1, const int8_t *gt_phase, int T,
968 const int *imap, float err_log_prb, float ad_rho, const float *bdev_arr, int m) {
969 int N = 1 + 2 * m;
970 float *emis_log_lkl = (float *)malloc(N * T * sizeof(float));
971 for (int i = 0; i < 1 + m; i++) {
972 float bdev = i == 0 ? 0.0f : bdev_arr[i - 1];
973 // TODO this function should come out of the loop
974 int n1, n2;
975 get_max_sum(ad0, ad1, T, imap, &n1, &n2);
976 beta_binom_t *beta_binom = i == 0 ? beta_binom_null : beta_binom_alt;
977 beta_binom_update(beta_binom, 0.5f + bdev, ad_rho, n1, n2);
978
979 for (int t = 0; t < T; t++) {
980 int16_t a = imap ? ad0[imap[t]] : ad0[t];
981 int16_t b = imap ? ad1[imap[t]] : ad1[t];
982 int8_t p = imap ? gt_phase[imap[t]] : gt_phase[t];
983 emis_log_lkl[t * N + i] = ad_phase_log_lkl(a, b, p, beta_binom);
984 if (i > 0) emis_log_lkl[t * N + m + i] = ad_phase_log_lkl(b, a, p, beta_binom);
985 }
986 }
987 for (int t = 0; t < T; t++) rescale_emis_log_lkl(&emis_log_lkl[t * N], N, err_log_prb);
988 return emis_log_lkl;
989 }
990
cnp_edge_is_not_cn2_lrr_ad(const float * lrr,int16_t * ad0,int16_t * ad1,int n,int a,int b,float xy_log_prb,float err_log_prb,float lrr_bias,float lrr_hap2dip,float lrr_sd,float ad_rho,float ldev,float bdev)991 static int cnp_edge_is_not_cn2_lrr_ad(const float *lrr, int16_t *ad0, int16_t *ad1, int n, int a, int b,
992 float xy_log_prb, float err_log_prb, float lrr_bias, float lrr_hap2dip,
993 float lrr_sd, float ad_rho, float ldev, float bdev) {
994 int n1, n2;
995 get_max_sum(ad0, ad1, n, NULL, &n1, &n2);
996 beta_binom_update(beta_binom_null, 0.5f, ad_rho, n1, n2);
997 beta_binom_update(beta_binom_alt, 0.5f + bdev, ad_rho, n1, n2);
998
999 // test left edge
1000 float sum_log_lkl = 0.0f;
1001 for (int i = a - 1; i >= 0; i--) {
1002 float log_lkl = lrr_ad_log_lkl(lrr[i], ad0[i], ad1[i], ldev, lrr_sd, lrr_bias, beta_binom_alt)
1003 - lrr_ad_log_lkl(lrr[i], ad0[i], ad1[i], 0.0f, lrr_sd, lrr_bias, beta_binom_null);
1004 if (!isnan(err_log_prb)) {
1005 if (log_lkl < err_log_prb)
1006 log_lkl = err_log_prb;
1007 else if (log_lkl > -err_log_prb)
1008 log_lkl = -err_log_prb;
1009 }
1010 sum_log_lkl += log_lkl;
1011 if (sum_log_lkl > -xy_log_prb) return -1;
1012 if (sum_log_lkl < xy_log_prb) break;
1013 }
1014
1015 // test right edge
1016 sum_log_lkl = 0.0f;
1017 for (int i = b + 1; i < n; i++) {
1018 float log_lkl = lrr_ad_log_lkl(lrr[i], ad0[i], ad1[i], ldev, lrr_sd, lrr_bias, beta_binom_alt)
1019 - lrr_ad_log_lkl(lrr[i], ad0[i], ad1[i], 0.0f, lrr_sd, lrr_bias, beta_binom_null);
1020 if (!isnan(err_log_prb)) {
1021 if (log_lkl < err_log_prb)
1022 log_lkl = err_log_prb;
1023 else if (log_lkl > -err_log_prb)
1024 log_lkl = -err_log_prb;
1025 }
1026 sum_log_lkl += log_lkl;
1027 if (sum_log_lkl > -xy_log_prb) return -1;
1028 if (sum_log_lkl < xy_log_prb) break;
1029 }
1030
1031 return 0;
1032 }
1033
1034 // return the LOD likelihood for a segment
lrr_ad_lod(const float * lrr_arr,const int16_t * ad0_arr,const int16_t * ad1_arr,int n,const int * imap,float err_log_prb,float lrr_bias,float lrr_hap2dip,float lrr_sd,float ad_rho,double bdev_lrr_baf)1035 static double lrr_ad_lod(const float *lrr_arr, const int16_t *ad0_arr, const int16_t *ad1_arr, int n, const int *imap,
1036 float err_log_prb, float lrr_bias, float lrr_hap2dip, float lrr_sd, float ad_rho,
1037 double bdev_lrr_baf) {
1038 if (n == 0 || bdev_lrr_baf < -0.5 || bdev_lrr_baf > 0.25) return -INFINITY; // kmin_brent does not handle NAN
1039
1040 float ldev = -logf(1.0f - 2.0f * (float)bdev_lrr_baf) / (float)M_LN2 * lrr_hap2dip;
1041 int n1, n2;
1042 get_max_sum(ad0_arr, ad1_arr, n, imap, &n1, &n2);
1043 beta_binom_update(beta_binom_null, 0.5f, ad_rho, n1, n2);
1044 beta_binom_update(beta_binom_alt, 0.5f + (float)bdev_lrr_baf, ad_rho, n1, n2);
1045 float ret = 0.0f;
1046 for (int i = 0; i < n; i++) {
1047 float lrr = imap ? lrr_arr[imap[i]] : lrr_arr[i];
1048 int16_t ad0 = imap ? ad0_arr[imap[i]] : ad0_arr[i];
1049 int16_t ad1 = imap ? ad1_arr[imap[i]] : ad1_arr[i];
1050 float log_lkl = lrr_ad_log_lkl(lrr, ad0, ad1, ldev, lrr_sd, lrr_bias, beta_binom_alt)
1051 - lrr_ad_log_lkl(lrr, ad0, ad1, 0.0f, lrr_sd, lrr_bias, beta_binom_null);
1052 if (!isnan(err_log_prb)) {
1053 if (log_lkl < err_log_prb)
1054 log_lkl = err_log_prb;
1055 else if (log_lkl > -err_log_prb)
1056 log_lkl = -err_log_prb;
1057 }
1058 ret += log_lkl;
1059 }
1060 return (double)ret * M_LOG10E;
1061 }
1062
1063 // return the LOD likelihood for a segment
ad_phase_lod(const int16_t * ad0_arr,const int16_t * ad1_arr,const int8_t * gt_phase,int n,const int * imap,const int8_t * as,float err_log_prb,float ad_rho,double bdev)1064 static double ad_phase_lod(const int16_t *ad0_arr, const int16_t *ad1_arr, const int8_t *gt_phase, int n,
1065 const int *imap, const int8_t *as, float err_log_prb, float ad_rho, double bdev) {
1066 if (n == 0 || bdev < 0.0 || bdev > 0.5) return -INFINITY; // kmin_brent does not handle NAN
1067
1068 int n1, n2;
1069 get_max_sum(ad0_arr, ad1_arr, n, imap, &n1, &n2);
1070 beta_binom_update(beta_binom_null, 0.5f, ad_rho, n1, n2);
1071 beta_binom_update(beta_binom_alt, 0.5f + (float)bdev, ad_rho, n1, n2);
1072 float ret = 0.0f;
1073 for (int i = 0; i < n; i++) {
1074 int16_t ad0 = imap ? ad0_arr[imap[i]] : ad0_arr[i];
1075 int16_t ad1 = imap ? ad1_arr[imap[i]] : ad1_arr[i];
1076 int8_t p = imap ? gt_phase[imap[i]] : gt_phase[i];
1077 if (as) p *= (int8_t)SIGN(as[i]); // notice as has no imap
1078 float log_lkl = ad_phase_log_lkl(ad0, ad1, p, beta_binom_alt) - ad_phase_log_lkl(ad0, ad1, 0, beta_binom_null);
1079 if (!isnan(err_log_prb)) {
1080 if (log_lkl < err_log_prb)
1081 log_lkl = err_log_prb;
1082 else if (log_lkl > -err_log_prb)
1083 log_lkl = -err_log_prb;
1084 }
1085 ret += log_lkl;
1086 }
1087 return (double)ret * M_LOG10E;
1088 }
1089
1090 typedef struct
1091 {
1092 const int16_t *ad0;
1093 const int16_t *ad1;
1094 const int8_t *gt_phase;
1095 int n;
1096 const int *imap;
1097 int8_t *path;
1098 float err_log_prb;
1099 float ad_rho;
1100 } f4_data_t;
1101
f4(double x,void * data)1102 double f4(double x, void *data)
1103 {
1104 f4_data_t *d = data;
1105
1106 return -ad_phase_lod(d->ad0, d->ad1, d->gt_phase, d->n, d->imap, d->path,
1107 d->err_log_prb, d->ad_rho, x);
1108 }
1109
f4_pack(const int16_t * ad0,const int16_t * ad1,const int8_t * gt_phase,int n,const int * imap,int8_t * path,float err_log_prb,float ad_rho)1110 static inline f4_data_t *f4_pack(const int16_t *ad0, const int16_t *ad1,
1111 const int8_t *gt_phase, int n, const int *imap, int8_t *path,
1112 float err_log_prb, float ad_rho)
1113 {
1114 f4_data_t *d = malloc(sizeof(f4_data_t));
1115
1116 d->ad0 = ad0;
1117 d->ad1 = ad1;
1118 d->gt_phase = gt_phase;
1119 d->n = n;
1120 d->imap = imap;
1121 d->path = path;
1122 d->err_log_prb = err_log_prb;
1123 d->ad_rho = ad_rho;
1124
1125 return d;
1126 }
1127
1128 // TODO find a better title for this function
compare_wgs_models(const int16_t * ad0,const int16_t * ad1,const int8_t * gt_phase,int n,const int * imap,float xy_log_prb,float err_log_prb,float flip_log_prb,float tel_log_prb,float ad_rho,const float * bdev,int m)1129 static float compare_wgs_models(const int16_t *ad0, const int16_t *ad1, const int8_t *gt_phase, int n, const int *imap,
1130 float xy_log_prb, float err_log_prb, float flip_log_prb, float tel_log_prb,
1131 float ad_rho, const float *bdev, int m) {
1132 if (n == 0) return NAN;
1133 float *emis_log_lkl = ad_phase_emis_log_lkl(ad0, ad1, gt_phase, n, imap, err_log_prb, ad_rho, bdev, m);
1134 int8_t *path = log_viterbi_run(emis_log_lkl, n, m, xy_log_prb, xy_log_prb, tel_log_prb, flip_log_prb, 0,
1135 0); // TODO can I not pass these values instead of 0 0?
1136 free(emis_log_lkl);
1137 int n_flips = 0;
1138 for (int i = 1; i < n; i++)
1139 if (path[i - 1] && path[i] && path[i - 1] != path[i]) n_flips++;
1140 f4_data_t *f4_data = f4_pack(ad0, ad1, gt_phase, n, imap, path, err_log_prb, ad_rho);
1141 double x, fx = kmin_brent(f4, 0.1, 0.2, f4_data, KMIN_EPS, &x);
1142 free(f4_data);
1143 free(path);
1144 return -(float)fx + (float)n_flips * flip_log_prb * (float)M_LOG10E;
1145 }
1146
1147 // TODO change this or integrate with ad_lod
lod_lkl_beta_binomial(const int16_t * ad0_arr,const int16_t * ad1_arr,int n,const int * imap,double ad_rho)1148 static double lod_lkl_beta_binomial(const int16_t *ad0_arr, const int16_t *ad1_arr, int n, const int *imap,
1149 double ad_rho) {
1150 if (n == 0 || ad_rho <= 0.0 || ad_rho >= 1.0) return -INFINITY;
1151 float ret = 0.0f;
1152 int n1, n2;
1153 get_max_sum(ad0_arr, ad1_arr, n, imap, &n1, &n2);
1154 beta_binom_update(beta_binom_null, 0.5f, ad_rho, n1, n2);
1155 for (int i = 0; i < n; i++) {
1156 int16_t ad0 = imap ? ad0_arr[imap[i]] : ad0_arr[i];
1157 int16_t ad1 = imap ? ad1_arr[imap[i]] : ad1_arr[i];
1158 ret += beta_binom_log_lkl(beta_binom_null, ad0, ad1);
1159 }
1160 return (double)ret * M_LOG10E;
1161 }
1162
1163 /*********************************
1164 * BASIC STATISTICS FUNCTIONS *
1165 *********************************/
1166
1167 // iterator of non-NaN values
next_not_missing(const float * v,const int * imap,int n,int * i)1168 static inline float next_not_missing(const float *v, const int *imap, int n, int *i) {
1169 float x = NAN;
1170 while (*i < n) {
1171 x = imap ? v[imap[*i]] : v[*i];
1172 if (!isnan(x)) break;
1173 (*i)++;
1174 }
1175 return x;
1176 }
1177
1178 // compute BAF phase concordance for a float array with iterator
get_baf_conc(const float * baf,const int8_t * gt_phase,int n,int * conc,int * disc)1179 static void get_baf_conc(const float *baf, const int8_t *gt_phase, int n, int *conc, int *disc) {
1180 int i;
1181 float prev = NAN, next = NAN;
1182 *conc = 0, *disc = 0;
1183 for (i = 0; i < n; i++) {
1184 prev = (baf[i] - 0.5f) * (float)gt_phase[i];
1185 if (!isnan(prev) && prev != 0.0f) break;
1186 }
1187 if (i == n) return;
1188
1189 for (i++; i < n; i++) {
1190 next = (baf[i] - 0.5f) * (float)gt_phase[i];
1191 if (!isnan(next) && next != 0.0f) {
1192 if (prev * next > 0.0f)
1193 (*conc)++;
1194 else if (prev * next < 0.0f)
1195 (*disc)++;
1196 prev = next;
1197 }
1198 }
1199 }
1200
1201 // compute phased BAF autocorrelation for a float array with iterator
get_baf_auto_corr(const float * baf,const int8_t * gt_phase,int n)1202 static float get_baf_auto_corr(const float *baf, const int8_t *gt_phase, int n) {
1203 double var = 0.0, auto_corr = 0.0;
1204 float prev = NAN, next = NAN;
1205 for (int i = 0; i < n; i++) {
1206 next = (baf[i] - 0.5f) * (float)gt_phase[i];
1207 if (!isnan(next)) {
1208 var += sq((double)next);
1209 if (!isnan(prev)) auto_corr += prev * next;
1210 prev = next;
1211 }
1212 }
1213 auto_corr /= var;
1214 return auto_corr;
1215 }
1216
1217 // compute sample standard deviation of a float array (with iterator)
1218 // sqrt ( ( \sum x^2 - (\sum x)^2 / N ) / ( N - 1 ) )
get_sample_sd(const float * v,int n,const int * imap)1219 static float get_sample_sd(const float *v, int n, const int *imap) {
1220 float mean = get_sample_mean(v, n, imap);
1221 float s2 = 0.0;
1222 int j = 0;
1223 for (int i = 0; i < n; i++) {
1224 float tmp = imap ? v[imap[i]] : v[i];
1225 if (!isnan(tmp)) {
1226 s2 += sq(tmp - mean);
1227 j++;
1228 }
1229 }
1230 s2 /= (double)(j - 1);
1231 return (float)sqrt(s2);
1232 }
1233
1234 // compute standard error of mean a float array (with iterator)
1235 // sqrt ( ( \sum x^2 - (\sum x)^2 / N ) / ( N - 1 ) / N )
get_se_mean(const float * v,int n,const int * imap)1236 static float get_se_mean(const float *v, int n, const int *imap) {
1237 int j = 0;
1238 for (int i = 0; i < n; i++) {
1239 float tmp = imap ? v[imap[i]] : v[i];
1240 if (!isnan(tmp)) j++;
1241 }
1242 if (j <= 1) return NAN;
1243
1244 return get_sample_sd(v, n, imap) / sqrtf(j);
1245 }
1246
1247 // compute (adjusted) LRR autocorrelation for a float array with iterator
get_lrr_auto_corr(const float * lrr,int n,const int * imap)1248 static float get_lrr_auto_corr(const float *lrr, int n, const int *imap) {
1249 float value;
1250 double mean = 0.0;
1251 int i = 0, j = 0;
1252 for (value = next_not_missing(lrr, imap, n, &i); i < n; i++, value = next_not_missing(lrr, imap, n, &i)) {
1253 mean += (double)value;
1254 j++;
1255 }
1256 if (j <= 1) return NAN;
1257 mean /= (double)j;
1258
1259 double var = 0.0;
1260 i = 0;
1261 for (value = next_not_missing(lrr, imap, n, &i); i < n; i++, value = next_not_missing(lrr, imap, n, &i)) {
1262 var += sq((double)value - mean);
1263 }
1264
1265 double auto_corr = 0.0;
1266 i = 0;
1267 double prev = (double)next_not_missing(lrr, imap, n, &i) - mean, next;
1268 for (i++, value = next_not_missing(lrr, imap, n, &i); i < n; i++, value = next_not_missing(lrr, imap, n, &i)) {
1269 next = (double)value - mean;
1270 auto_corr += prev * next;
1271 prev = next;
1272 }
1273 auto_corr /= var;
1274 return auto_corr;
1275 }
1276
1277 // compute the n50 of the call when split at the heterozygous sites
get_n50_hets(const int * v,const float * baf,int n,int beg_pos,int end_pos)1278 static int get_n50_hets(const int *v, const float *baf, int n, int beg_pos, int end_pos) {
1279 int *w = (int *)malloc((n + 1) * sizeof(int));
1280 int i, j, sum, prev = beg_pos;
1281 for (i = 0, j = 0; i < n; i++) {
1282 if (!isnan(baf[i])) {
1283 w[j++] = v[i] - prev;
1284 prev = v[i];
1285 }
1286 }
1287 w[j++] = end_pos - prev;
1288
1289 ks_introsort_int((size_t)j, w);
1290
1291 for (i = 0, sum = 0; sum << 1 < end_pos - beg_pos && i < j; i++) sum += w[i];
1292 int n50 = w[i - 1];
1293 free(w);
1294 return n50;
1295 }
1296
1297 /*********************************
1298 * SAMPLE METHODS *
1299 *********************************/
1300
mocha_print_ucsc(FILE * restrict stream,const mocha_t * mocha,int n,const bcf_hdr_t * hdr)1301 static void mocha_print_ucsc(FILE *restrict stream, const mocha_t *mocha, int n, const bcf_hdr_t *hdr) {
1302 if (stream == NULL) return;
1303 const char *name[4];
1304 name[MOCHA_UNDET] = "mCA_undetermined";
1305 name[MOCHA_LOSS] = "mCA_loss";
1306 name[MOCHA_GAIN] = "mCA_gain";
1307 name[MOCHA_CNLOH] = "mCA_neutral";
1308 const char *desc[4];
1309 desc[MOCHA_UNDET] = "Undetermined";
1310 desc[MOCHA_LOSS] = "Losses";
1311 desc[MOCHA_GAIN] = "Gains";
1312 desc[MOCHA_CNLOH] = "CN-LOHs";
1313 kstring_t tmp = {0, 0, NULL};
1314 for (int i = 0; i < 4; i++) {
1315 fprintf(stream, "track name=%s description=\"%s\" visibility=4 priority=1 itemRgb=\"On\"\n", name[i], desc[i]);
1316 for (int j = 0; j < n; j++) {
1317 uint8_t red = 127, green = 127, blue = 127;
1318 switch (i) {
1319 case MOCHA_LOSS:
1320 red -= (uint8_t)(127.0f * sqf(mocha[j].cf));
1321 green -= (uint8_t)(127.0f * sqf(mocha[j].cf));
1322 blue += (uint8_t)(128.0f * sqf(mocha[j].cf));
1323 break;
1324 case MOCHA_GAIN:
1325 red += (uint8_t)(128.0f * sqf(mocha[j].cf));
1326 green -= (uint8_t)(127.0f * sqf(mocha[j].cf));
1327 blue -= (uint8_t)(127.0f * sqf(mocha[j].cf));
1328 break;
1329 case MOCHA_CNLOH:
1330 red += (uint8_t)(128.0f * mocha[j].cf);
1331 green += (uint8_t)(38.0f * mocha[j].cf);
1332 blue -= (uint8_t)(127.0f * mocha[j].cf);
1333 break;
1334 }
1335 if (i == mocha[j].type) {
1336 const char *sample_name = bcf_hdr_int2id(hdr, BCF_DT_SAMPLE, mocha[j].sample_idx);
1337 tmp.l = 0;
1338 kputs(sample_name, &tmp);
1339 for (char *p = tmp.s; (p = strchr(p, ' ')); ++p) *p = '_';
1340 const char *seq_name = bcf_hdr_id2name(hdr, mocha[j].rid);
1341 if (strncmp(seq_name, "chr", 3) == 0)
1342 fprintf(stream, "%s\t%d\t%d\t%s\t%d\t.\t%d\t%d\t%d,%d,%d\n", seq_name, mocha[j].beg_pos,
1343 mocha[j].end_pos, tmp.s, isnan(mocha[j].cf) ? 0 : (int)(1e3 * mocha[j].cf),
1344 mocha[j].beg_pos, mocha[j].end_pos, red, green, blue);
1345 else
1346 fprintf(stream, "chr%s\t%d\t%d\t%s\t%d\t.\t%d\t%d\t%d,%d,%d\n", seq_name, mocha[j].beg_pos,
1347 mocha[j].end_pos, tmp.s, isnan(mocha[j].cf) ? 0 : (int)(1e3 * mocha[j].cf),
1348 mocha[j].beg_pos, mocha[j].end_pos, red, green, blue);
1349 }
1350 }
1351 }
1352 free(tmp.s);
1353 if (stream != stdout && stream != stderr) fclose(stream);
1354 }
1355
mocha_print_calls(FILE * restrict stream,const mocha_t * mocha,int n,const bcf_hdr_t * hdr,int flags,char * genome,float lrr_hap2dip)1356 static void mocha_print_calls(FILE *restrict stream, const mocha_t *mocha, int n, const bcf_hdr_t *hdr, int flags,
1357 char *genome, float lrr_hap2dip) {
1358 if (stream == NULL) return;
1359 char gender[3];
1360 gender[GENDER_UNKNOWN] = 'U';
1361 gender[GENDER_MALE] = 'M';
1362 gender[GENDER_FEMALE] = 'F';
1363 const char *type[6];
1364 type[MOCHA_UNDET] = "Undetermined";
1365 type[MOCHA_LOSS] = "Loss";
1366 type[MOCHA_GAIN] = "Gain";
1367 type[MOCHA_CNLOH] = "CN-LOH";
1368 type[MOCHA_CNP_LOSS] = "CNP_Loss";
1369 type[MOCHA_CNP_GAIN] = "CNP_Gain";
1370 char arm_type[3];
1371 arm_type[MOCHA_NOT] = 'N';
1372 arm_type[MOCHA_ARM] = 'Y';
1373 arm_type[MOCHA_TEL] = 'T';
1374 fputs("sample_id", stream);
1375 fputs("\tcomputed_gender", stream);
1376 fputs("\tchrom", stream);
1377 fprintf(stream, "\tbeg_%s", genome);
1378 fprintf(stream, "\tend_%s", genome);
1379 fputs("\tlength", stream);
1380 fputs("\tp_arm", stream);
1381 fputs("\tq_arm", stream);
1382 fputs("\tn_sites", stream);
1383 fputs("\tn_hets", stream);
1384 fputs("\tn50_hets", stream);
1385 fputs("\tbdev", stream);
1386 fputs("\tbdev_se", stream);
1387 fputs("\trel_cov", stream);
1388 fputs("\trel_cov_se", stream);
1389 fputs("\tlod_lrr_baf", stream);
1390 fputs("\tlod_baf_phase", stream);
1391 fputs("\tn_flips", stream);
1392 fputs("\tbaf_conc", stream);
1393 fputs("\tlod_baf_conc", stream);
1394 fputs("\ttype", stream);
1395 fputs("\tcf", stream);
1396 fputc('\n', stream);
1397 for (int i = 0; i < n; i++) {
1398 fputs(bcf_hdr_int2id(hdr, BCF_DT_SAMPLE, mocha->sample_idx), stream);
1399 fprintf(stream, "\t%c", gender[mocha->computed_gender]);
1400 fprintf(stream, "\t%s", bcf_hdr_id2name(hdr, mocha->rid));
1401 fprintf(stream, "\t%d", mocha->beg_pos);
1402 fprintf(stream, "\t%d", mocha->end_pos);
1403 fprintf(stream, "\t%d", mocha->length);
1404 fprintf(stream, "\t%c", arm_type[mocha->p_arm]);
1405 fprintf(stream, "\t%c", arm_type[mocha->q_arm]);
1406 fprintf(stream, "\t%d", mocha->n_sites);
1407 fprintf(stream, "\t%d", mocha->n_hets);
1408 fprintf(stream, "\t%d", mocha->n50_hets);
1409 fprintf(stream, "\t%.4f", mocha->bdev);
1410 fprintf(stream, "\t%.4f", mocha->bdev_se);
1411 fprintf(stream, "\t%.4f", 2.0f * expf(mocha->ldev / lrr_hap2dip * (float)M_LN2));
1412 fprintf(stream, "\t%.4f",
1413 2.0f * expf(mocha->ldev / lrr_hap2dip * (float)M_LN2) * mocha->ldev_se / lrr_hap2dip * (float)M_LN2);
1414 fprintf(stream, "\t%.2f", mocha->lod_lrr_baf);
1415 fprintf(stream, "\t%.2f", mocha->lod_baf_phase);
1416 fprintf(stream, "\t%d", mocha->n_flips);
1417 fprintf(stream, "\t%.4f", mocha->baf_conc);
1418 fprintf(stream, "\t%.2f", mocha->lod_baf_conc);
1419 fprintf(stream, "\t%s", type[mocha->type]);
1420 fprintf(stream, "\t%.4f", mocha->cf);
1421 fputc('\n', stream);
1422 mocha++;
1423 }
1424 if (stream != stdout && stream != stderr) fclose(stream);
1425 }
1426
1427 // this function returns two values (a, b) such that:
1428 // (i) a <= b (ii) pos[a-1] < beg (iii) beg <= pos[a] < end (iv) beg <= pos[b] < end (v)
1429 // pos[b+1] >= end
get_cnp_edges(const int * pos,int n,int beg,int end,int * a,int * b)1430 static int get_cnp_edges(const int *pos, int n, int beg, int end, int *a, int *b) {
1431 if (pos[0] >= end || pos[n - 1] < beg) return -1;
1432
1433 int i = 0, j = n - 1, k;
1434 while (j - i > 1) {
1435 k = (i + j) / 2;
1436 if (pos[k] < beg)
1437 i = k;
1438 else
1439 j = k;
1440 }
1441 if (pos[j] >= end) return -1;
1442
1443 *a = j;
1444
1445 i = j;
1446 j = n - 1;
1447 while (j - i > 1) {
1448 k = (i + j) / 2;
1449 if (pos[k] < end)
1450 i = k;
1451 else
1452 j = k;
1453 }
1454
1455 *b = i;
1456 return 0;
1457 }
1458
1459 // classify mosaic chromosomal alteration type based on LRR and BAF
1460 // LDEV = -log2( 1 - 2 x BDEV ) * LRR-hap2dip for gains
1461 // LDEV = -log2( 1 + 2 x BDEV ) * LRR-hap2dip for losses
mocha_type(float ldev,float ldev_se,float bdev,float bdev_se,int n_hets,float lrr_hap2dip,int8_t p_arm,int8_t q_arm)1462 static int8_t mocha_type(float ldev, float ldev_se, float bdev, float bdev_se, int n_hets, float lrr_hap2dip,
1463 int8_t p_arm, int8_t q_arm) {
1464 // a LOD score can be computed from a chi-squared statistic by dividing by 2ln(10) ~ 4.6
1465 float z2_cnloh = sqf(ldev / ldev_se);
1466 // equivalent of a 2 LOD score bonus for ending in one but not two telomeres
1467 if ((p_arm != MOCHA_TEL && q_arm != MOCHA_TEL) || (p_arm == MOCHA_TEL && q_arm == MOCHA_TEL))
1468 z2_cnloh += 4.0f * M_LN10;
1469 else
1470 z2_cnloh -= 4.0f * M_LN10;
1471
1472 // if one model has 4 LOD scores point more than the other model, select the better model
1473 if (ldev > 0) {
1474 if (n_hets < 5 || isnan(bdev)) {
1475 if (!isnan(bdev_se) && z2_cnloh < 8.0 * M_LN10)
1476 return MOCHA_CNLOH;
1477 else
1478 return MOCHA_GAIN;
1479 } else {
1480 float expected_ldev =
1481 -logf(1.0f - 2.0f * bdev > 2.0f / 3.0f ? 1.0f - 2.0f * bdev : 2.0f / 3.0f) * M_LOG2E * lrr_hap2dip;
1482 float z2_gain = sqf((ldev - expected_ldev) / ldev_se);
1483 if (z2_cnloh > z2_gain + 8.0f * M_LN10) return MOCHA_GAIN;
1484 if (z2_gain > z2_cnloh + 8.0f * M_LN10) return MOCHA_CNLOH;
1485 }
1486 } else {
1487 if (n_hets < 5 || isnan(bdev)) {
1488 if (!isnan(bdev_se) && z2_cnloh < 8.0 * M_LN10)
1489 return MOCHA_CNLOH;
1490 else
1491 return MOCHA_LOSS;
1492 } else {
1493 float expected_ldev = -logf(1.0f + 2.0f * bdev) * M_LOG2E * lrr_hap2dip;
1494 float z2_loss = sqf((ldev - expected_ldev) / ldev_se);
1495 if (z2_cnloh > z2_loss + 8.0f * M_LN10) return MOCHA_LOSS;
1496 if (z2_loss > z2_cnloh + 8.0f * M_LN10) return MOCHA_CNLOH;
1497 }
1498 }
1499 return MOCHA_UNDET;
1500 }
1501
1502 // best estimate for cell fraction using the following formula (if BDEV is available):
1503 // BDEV = | 1 / 2 - 1 / CNF |
1504 // CNF = 2 / ( 1 + 2 x BDEV ) for losses
1505 // CNF = 2 / ( 1 - 2 x BDEV ) for gains
1506 // CNF = 2 x 2^( LDEV / LRR-hap2dip )
1507 // LDEV = - LRR-hap2dip / ln(2) * ln( 1 + 2 x BDEV ) for losses
1508 // LDEV = - LRR-hap2dip / ln(2) * ln( 1 - 2 x BDEV ) for gains
mocha_cell_fraction(float ldev,float ldev_se,float bdev,int n_hets,int8_t type,float lrr_hap2dip)1509 static float mocha_cell_fraction(float ldev, float ldev_se, float bdev, int n_hets, int8_t type, float lrr_hap2dip) {
1510 if (n_hets < 5 || isnan(bdev)) {
1511 switch (type) {
1512 case MOCHA_LOSS:
1513 return ldev < -lrr_hap2dip ? 1.0f : -2.0f * (expf(ldev / lrr_hap2dip * (float)M_LN2) - 1.0f);
1514 case MOCHA_GAIN:
1515 return ldev * (float)M_LN2 > lrr_hap2dip * logf(1.5f)
1516 ? 1.0f
1517 : 2.0f * (expf(ldev / lrr_hap2dip * (float)M_LN2) - 1.0f);
1518 default:
1519 return NAN;
1520 }
1521 } else {
1522 switch (type) {
1523 case MOCHA_LOSS:
1524 return 4.0f * bdev / (1.0f + 2.0f * bdev);
1525 case MOCHA_GAIN:
1526 return bdev > 1.0f / 6.0f ? 1.0f : 4.0f * bdev / (1.0f - 2.0f * bdev);
1527 case MOCHA_CNLOH:
1528 return 2.0f * bdev;
1529 case MOCHA_UNDET:
1530 return bdev < 0.05f ? 4.0f * bdev : NAN; // here it assumes it is either a loss or a gain
1531 default:
1532 return NAN;
1533 }
1534 }
1535 }
1536
get_mocha_stats(const int * pos,const float * lrr,const float * baf,const int8_t * gt_phase,int n,int a,int b,int cen_beg,int cen_end,int length,float baf_conc,mocha_t * mocha)1537 static void get_mocha_stats(const int *pos, const float *lrr, const float *baf, const int8_t *gt_phase, int n, int a,
1538 int b, int cen_beg, int cen_end, int length, float baf_conc, mocha_t *mocha) {
1539 mocha->n_sites = b + 1 - a;
1540
1541 if (a == 0)
1542 if (pos[a] < cen_beg)
1543 mocha->beg_pos = 0;
1544 else
1545 mocha->beg_pos = cen_end;
1546 else
1547 mocha->beg_pos = pos[a];
1548
1549 if (b == n - 1)
1550 if (pos[b] >= cen_end)
1551 mocha->end_pos = length;
1552 else
1553 mocha->end_pos = cen_beg;
1554 else
1555 mocha->end_pos = pos[b];
1556
1557 mocha->length = mocha->end_pos - mocha->beg_pos;
1558
1559 if (mocha->beg_pos == 0)
1560 mocha->p_arm = MOCHA_TEL;
1561 else if (mocha->beg_pos < cen_beg)
1562 mocha->p_arm = MOCHA_ARM;
1563 else
1564 mocha->p_arm = MOCHA_NOT;
1565
1566 if (mocha->end_pos == length)
1567 mocha->q_arm = MOCHA_TEL;
1568 else if (mocha->end_pos > cen_end)
1569 mocha->q_arm = MOCHA_ARM;
1570 else
1571 mocha->q_arm = MOCHA_NOT;
1572
1573 mocha->ldev_se = get_se_mean(lrr + a, b + 1 - a, NULL);
1574 mocha->n_hets = 0;
1575 for (int i = a; i <= b; i++)
1576 if (!isnan(baf[i])) mocha->n_hets++;
1577 int conc, disc;
1578 get_baf_conc(baf + a, gt_phase + a, b + 1 - a, &conc, &disc);
1579 mocha->baf_conc = conc + disc > 0 ? (float)conc / (float)(conc + disc) : NAN;
1580 mocha->lod_baf_conc = ((mocha->baf_conc > 0 ? (float)conc * logf(mocha->baf_conc / baf_conc) : 0)
1581 + (mocha->baf_conc < 1 ? (float)disc * logf((1 - mocha->baf_conc) / (1 - baf_conc)) : 0))
1582 * (float)M_LOG10E;
1583 mocha->n50_hets = get_n50_hets(pos + a, baf + a, b + 1 - a, mocha->beg_pos, mocha->end_pos);
1584 mocha->n_flips = -1;
1585 mocha->bdev = NAN;
1586 mocha->bdev_se = NAN;
1587 mocha->lod_baf_phase = NAN;
1588 }
1589
1590 // return segments called by the HMM or a suggestion of what state should be added to the HMM
get_path_segs(const int8_t * path,const float * hs_arr,int n,int hmm_model,int middle,int ** beg,int * m_beg,int ** end,int * m_end,int * nseg)1591 static float get_path_segs(const int8_t *path, const float *hs_arr, int n, int hmm_model, int middle, int **beg,
1592 int *m_beg, int **end, int *m_end, int *nseg) {
1593 int a = 0, b = 0;
1594 *nseg = 0;
1595 for (b = 0; b < n; b++) {
1596 // check whether it is the end of a segment
1597 if (b != n - 1) {
1598 int x = abs(path[b]);
1599 int y = abs(path[b + 1]);
1600 if (x == y) continue;
1601
1602 // swap the two elements
1603 if (x > y) {
1604 x = abs(path[b + 1]);
1605 y = abs(path[b]);
1606 }
1607
1608 if (y - x == 1) {
1609 if (hmm_model == LRR_BAF && x > 0 && x != middle)
1610 return (hs_arr[x - 1] + hs_arr[y - 1]) * 0.5f;
1611 else if (hmm_model == BAF_PHASE)
1612 return x == 0 ? hs_arr[0] * 0.5f : (hs_arr[x - 1] + hs_arr[y - 1]) * 0.5f;
1613 }
1614 }
1615
1616 if (path[b]) {
1617 (*nseg)++;
1618 hts_expand(int, *nseg, *m_beg, *beg);
1619 (*beg)[(*nseg) - 1] = a;
1620 hts_expand(int, *nseg, *m_end, *end);
1621 (*end)[(*nseg) - 1] = b;
1622 }
1623 a = b + 1;
1624 }
1625 return 0;
1626 }
1627
1628 typedef struct
1629 {
1630 float *lrr;
1631 int a;
1632 int16_t *ad0;
1633 int16_t *ad1;
1634 mocha_t *mocha;
1635 const model_t *model;
1636 sample_t *self;
1637 float *baf;
1638 } f3_data_t;
1639
f3(double x,void * data)1640 double f3(double x, void *data)
1641 {
1642 f3_data_t *d = data;
1643
1644 if (d->model->flags & WGS_DATA)
1645 return -lrr_ad_lod(d->lrr + d->a, d->ad0 + d->a, d->ad1 + d->a,
1646 d->mocha->n_sites, NULL, NAN, d->model->lrr_bias,
1647 d->model->lrr_hap2dip, d->self->adjlrr_sd,
1648 d->self->stats.dispersion, x);
1649 else
1650 return -lrr_baf_lod(d->lrr + d->a, d->baf + d->a, d->mocha->n_sites,
1651 NULL, NAN, d->model->lrr_bias, d->model->lrr_hap2dip,
1652 d->self->adjlrr_sd, d->self->stats.dispersion, x);
1653 }
1654
f3_pack(float * lrr,int a,int16_t * ad0,int16_t * ad1,mocha_t * mocha,const model_t * model,sample_t * self,float * baf)1655 static inline f3_data_t *f3_pack(float *lrr, int a, int16_t *ad0, int16_t *ad1,
1656 mocha_t *mocha, const model_t *model, sample_t *self, float *baf)
1657
1658 {
1659 f3_data_t *d = malloc(sizeof(f3_data_t));
1660
1661 d->lrr = lrr;
1662 d->a = a;
1663 d->ad0 = ad0;
1664 d->ad1 = ad1;
1665 d->mocha = mocha;
1666 d->model = model;
1667 d->self = self;
1668 d->baf = baf;
1669
1670 return d;
1671 }
1672
1673 typedef struct
1674 {
1675 int16_t *ad0;
1676 int16_t *ad1;
1677 int8_t *gt_phase;
1678 mocha_t *mocha;
1679 int *imap_arr;
1680 int *beg;
1681 int i;
1682 int8_t *path;
1683 sample_t *self;
1684 } f6_data_t;
1685
f6(double x,void * data)1686 double f6(double x, void *data)
1687 {
1688 f6_data_t *d = data;
1689
1690 return -ad_phase_lod(d->ad0, d->ad1, d->gt_phase, d->mocha->n_hets,
1691 d->imap_arr + d->beg[d->i], d->path + d->beg[d->i], NAN, d->self->stats.dispersion, x);
1692 }
1693
f6_pack(int16_t * ad0,int16_t * ad1,int8_t * gt_phase,mocha_t * mocha,int * imap_arr,int * beg,int i,int8_t * path,sample_t * self)1694 static inline f6_data_t *f6_pack(int16_t *ad0, int16_t *ad1, int8_t *gt_phase,
1695 mocha_t *mocha, int *imap_arr, int *beg, int i, int8_t *path, sample_t *self)
1696
1697 {
1698 f6_data_t *d = malloc(sizeof(f6_data_t));
1699
1700 d->ad0 = ad0;
1701 d->ad1 = ad1;
1702 d->gt_phase = gt_phase;
1703 d->mocha = mocha;
1704 d->imap_arr = imap_arr;
1705 d->beg = beg;
1706 d->i = i;
1707 d->path = path;
1708 d->self = self;
1709
1710 return d;
1711 }
1712
1713 typedef struct
1714 {
1715 float *baf;
1716 int8_t *gt_phase;
1717 mocha_t *mocha;
1718 int *imap_arr;
1719 int *beg;
1720 int i;
1721 int8_t *path;
1722 sample_t *self;
1723 } f8_data_t;
1724
f8(double x,void * data)1725 double f8(double x, void *data)
1726 {
1727 f8_data_t *d = data;
1728
1729 return -baf_phase_lod(d->baf, d->gt_phase, d->mocha->n_hets,
1730 d->imap_arr + d->beg[d->i], d->path + d->beg[d->i], NAN,
1731 d->self->stats.dispersion, x);
1732 }
1733
f8_pack(float * baf,int8_t * gt_phase,mocha_t * mocha,int * imap_arr,int * beg,int i,int8_t * path,sample_t * self)1734 static inline f8_data_t *f8_pack(float *baf, int8_t *gt_phase, mocha_t *mocha,
1735 int *imap_arr, int *beg, int i, int8_t *path, sample_t *self)
1736 {
1737 f8_data_t *d = malloc(sizeof(f8_data_t));
1738
1739 d->baf = baf;
1740 d->gt_phase = gt_phase;
1741 d->mocha = mocha;
1742 d->imap_arr = imap_arr;
1743 d->beg = beg;
1744 d->i = i;
1745 d->path = path;
1746 d->self = self;
1747
1748 return d;
1749 }
1750
1751 // process one contig for one sample
sample_run(sample_t * self,mocha_table_t * mocha_table,const model_t * model)1752 static void sample_run(sample_t *self, mocha_table_t *mocha_table, const model_t *model) {
1753 // do nothing if chromosome Y or MT are being tested
1754 if (model->rid == model->genome_rules->y_rid || model->rid == model->genome_rules->mt_rid) {
1755 memset(self->data_arr[LDEV], 0, self->n * sizeof(int16_t));
1756 memset(self->data_arr[BDEV], 0, self->n * sizeof(int16_t));
1757 memset(self->phase_arr, 0, self->n * sizeof(int8_t));
1758 return;
1759 }
1760
1761 mocha_t mocha;
1762 mocha.sample_idx = self->idx;
1763 mocha.computed_gender = self->computed_gender;
1764 mocha.rid = model->rid;
1765
1766 int cen_beg = model->genome_rules->cen_beg[model->rid];
1767 int cen_end = model->genome_rules->cen_end[model->rid];
1768 int length = model->genome_rules->length[model->rid];
1769 if (length == 0) length = model->locus_arr[model->n - 1].pos;
1770 // incentive to extend to the telomere
1771 float tel_log_prb = model->rid == model->genome_rules->x_rid
1772 ? (self->computed_gender == GENDER_MALE ? model->chrY_tel_log_prb : model->chrX_tel_log_prb)
1773 : model->auto_tel_log_prb;
1774
1775 // declutter code by copying these values onto the stack
1776 int n = self->n;
1777 int8_t *gt_phase = self->phase_arr;
1778 int16_t *ad0 = self->data_arr[AD0];
1779 int16_t *ad1 = self->data_arr[AD1];
1780 float *lrr = (float *)malloc(n * sizeof(float));
1781 float *baf = (float *)malloc(n * sizeof(float));
1782 if (model->flags & WGS_DATA) {
1783 ad_to_lrr_baf(ad0, ad1, lrr, baf, n);
1784 } else {
1785 for (int i = 0; i < n; i++) {
1786 lrr[i] = int16_to_float(self->data_arr[LRR][i]);
1787 baf[i] = int16_to_float(self->data_arr[BAF][i]);
1788 }
1789 }
1790
1791 if (model->lrr_gc_order > 0 && n > model->lrr_gc_order)
1792 adjust_lrr(lrr, model->gc_arr, n, self->vcf_imap_arr, self->stats.coeffs, model->lrr_gc_order);
1793 else if (model->lrr_gc_order != -1)
1794 adjust_lrr(lrr, model->gc_arr, n, self->vcf_imap_arr, self->stats.coeffs, 0);
1795
1796 int8_t *as = (int8_t *)calloc(n, sizeof(int8_t));
1797 int *pos = (int *)malloc(n * sizeof(int));
1798 for (int i = 0; i < n; i++) pos[i] = model->locus_arr[self->vcf_imap_arr[i]].pos;
1799 int *imap_arr = (int *)malloc(n * sizeof(int));
1800 int *hets_imap_arr = (int *)malloc(n * sizeof(int));
1801 float *pbaf_arr = (float *)malloc(n * sizeof(float));
1802
1803 int16_t *ldev = (int16_t *)calloc(n, sizeof(int16_t));
1804 int16_t *bdev = (int16_t *)calloc(n, sizeof(int16_t));
1805
1806 if (model->rid == model->genome_rules->x_rid && self->computed_gender == GENDER_MALE) {
1807 for (int i = 0; i < n; i++) {
1808 if (pos[i] > model->genome_rules->x_nonpar_beg && pos[i] < model->genome_rules->x_nonpar_end) {
1809 lrr[i] = NAN;
1810 baf[i] = NAN;
1811 }
1812 }
1813 }
1814
1815 if (model->cnp_itr) {
1816 while (regitr_overlap(model->cnp_itr)) {
1817 int a, b;
1818 if (get_cnp_edges(pos, n, model->cnp_itr->beg, model->cnp_itr->end, &a, &b) == 0) {
1819 int cnp_type = regitr_payload(model->cnp_itr, int);
1820 float exp_ldev = NAN;
1821 float exp_bdev = NAN;
1822 mocha.type = MOCHA_UNDET;
1823 mocha.ldev = get_median(lrr + a, b + 1 - a, NULL);
1824 if (mocha.ldev > 0 && (cnp_type == MOCHA_CNP_GAIN || cnp_type == MOCHA_CNP_CNV)) {
1825 if (model->flags & WGS_DATA)
1826 mocha.lod_lrr_baf =
1827 lrr_ad_lod(lrr + a, ad0 + a, ad1 + a, b + 1 - a, NULL, model->err_log_prb, model->lrr_bias,
1828 model->lrr_hap2dip, self->adjlrr_sd, self->stats.dispersion, 1.0f / 6.0f);
1829 else
1830 mocha.lod_lrr_baf =
1831 lrr_baf_lod(lrr + a, baf + a, b + 1 - a, NULL, model->err_log_prb, model->lrr_bias,
1832 model->lrr_hap2dip, self->adjlrr_sd, self->stats.dispersion, 1.0f / 6.0f);
1833 if (mocha.lod_lrr_baf
1834 > -(model->xy_major_log_prb + model->xy_minor_log_prb) / 2.0f * (float)M_LOG10E) {
1835 mocha.type = MOCHA_CNP_GAIN;
1836 mocha.cf = NAN;
1837 exp_ldev = log2f(1.5f) * model->lrr_hap2dip;
1838 exp_bdev = 1.0f / 6.0f;
1839 }
1840 } else if (mocha.ldev <= 0 && (cnp_type == MOCHA_CNP_LOSS || cnp_type == MOCHA_CNP_CNV)) {
1841 if (model->flags & WGS_DATA)
1842 mocha.lod_lrr_baf =
1843 lrr_ad_lod(lrr + a, ad0 + a, ad1 + a, b + 1 - a, NULL, model->err_log_prb, model->lrr_bias,
1844 model->lrr_hap2dip, self->adjlrr_sd, self->stats.dispersion, -0.5f);
1845 else
1846 mocha.lod_lrr_baf =
1847 lrr_baf_lod(lrr + a, baf + a, b + 1 - a, NULL, model->err_log_prb, model->lrr_bias,
1848 model->lrr_hap2dip, self->adjlrr_sd, self->stats.dispersion, -0.5f);
1849 if (mocha.lod_lrr_baf
1850 > -(model->xy_major_log_prb + model->xy_minor_log_prb) / 2.0f * (float)M_LOG10E) {
1851 mocha.type = MOCHA_CNP_LOSS;
1852 mocha.cf = NAN;
1853 exp_ldev = -model->lrr_hap2dip;
1854 exp_bdev = 0.5f;
1855 }
1856 }
1857 if (mocha.type == MOCHA_CNP_GAIN || mocha.type == MOCHA_CNP_LOSS) {
1858 if (model->flags & WGS_DATA) {
1859 if (cnp_edge_is_not_cn2_lrr_ad(lrr, ad0, ad1, n, a, b,
1860 (model->xy_major_log_prb + model->xy_minor_log_prb) / 2.0f,
1861 model->err_log_prb, model->lrr_bias, model->lrr_hap2dip,
1862 self->adjlrr_sd, self->stats.dispersion, exp_ldev, exp_bdev))
1863 continue;
1864 } else {
1865 if (cnp_edge_is_not_cn2_lrr_baf(lrr, baf, n, a, b,
1866 (model->xy_major_log_prb + model->xy_minor_log_prb) / 2.0f,
1867 model->err_log_prb, model->lrr_bias, model->lrr_hap2dip,
1868 self->adjlrr_sd, self->stats.dispersion, exp_ldev, exp_bdev))
1869 continue;
1870 }
1871 get_mocha_stats(pos, lrr, baf, gt_phase, n, a, b, cen_beg, cen_end, length, self->stats.baf_conc,
1872 &mocha);
1873 // compute bdev, if possible
1874 if (mocha.n_hets > 0) {
1875 mocha.bdev = model->flags & WGS_DATA
1876 ? get_ad_bdev(ad0 + a, ad1 + a, b + 1 - a, NULL, self->stats.dispersion)
1877 : get_baf_bdev(baf + a, b + 1 - a, NULL, self->stats.dispersion);
1878 } else {
1879 mocha.bdev = NAN;
1880 }
1881 mocha_table->n++;
1882 hts_expand(mocha_t, mocha_table->n, mocha_table->m, mocha_table->a);
1883 mocha_table->a[mocha_table->n - 1] = mocha;
1884 for (int j = a; j <= b; j++) {
1885 // TODO add other stuff here, like setting ldev
1886 // and bdev
1887 lrr[j] = NAN; // do not use the data again
1888 baf[j] = NAN; // do not use the data again
1889 }
1890 }
1891 }
1892 }
1893 }
1894
1895 float *hs_arr = NULL;
1896 int n_hs = 0, m_hs = 0;
1897 for (int hmm_model = 0; hmm_model < 2; hmm_model++) {
1898 // select data to use from the contig, depending on which HMM model is being
1899 // used
1900 int last_p = 0, first_q = 0;
1901 int n_imap = 0;
1902 for (int i = 0; i < n; i++)
1903 if ((hmm_model == LRR_BAF && !isnan(lrr[i])) || (hmm_model == BAF_PHASE && !isnan(baf[i]))) {
1904 if (pos[i] < cen_beg) last_p++;
1905 if (pos[i] < cen_end) first_q++;
1906 imap_arr[n_imap] = i;
1907 n_imap++;
1908 }
1909 if (n_imap == 0) continue;
1910
1911 // compute emission probabilities and Viterbi path according to HMM model
1912 int middle = 0;
1913 // TODO eliminate this redundancy
1914 if (hmm_model == LRR_BAF) {
1915 n_hs = model->bdev_lrr_baf_n;
1916 hts_expand(float, n_hs, m_hs, hs_arr);
1917 for (int i = 0; i < model->bdev_lrr_baf_n; i++) {
1918 hs_arr[i] = model->bdev_lrr_baf[i];
1919 if (model->bdev_lrr_baf[i] < 0.0f) middle++;
1920 }
1921 } else if (hmm_model == BAF_PHASE) {
1922 n_hs = model->bdev_baf_phase_n;
1923 hts_expand(float, n_hs, m_hs, hs_arr);
1924 for (int i = 0; i < model->bdev_baf_phase_n; i++) hs_arr[i] = model->bdev_baf_phase[i];
1925 }
1926 int8_t *path;
1927 float ret;
1928 int *beg = NULL, m_beg = 0, *end = NULL, m_end = 0, nseg;
1929 do {
1930 if (n_hs + (hmm_model == LRR_BAF ? n_hs : 0) > 50) error("Too many states being tested for the HMM\n");
1931
1932 float *emis_log_lkl;
1933 if (model->flags & WGS_DATA) {
1934 emis_log_lkl =
1935 hmm_model == LRR_BAF
1936 ? lrr_ad_emis_log_lkl(lrr, ad0, ad1, n_imap, imap_arr, model->err_log_prb, model->lrr_bias,
1937 model->lrr_hap2dip, self->adjlrr_sd, self->stats.dispersion, hs_arr, n_hs)
1938 : ad_phase_emis_log_lkl(ad0, ad1, gt_phase, n_imap, imap_arr, model->err_log_prb,
1939 self->stats.dispersion, hs_arr, n_hs);
1940 } else {
1941 emis_log_lkl = hmm_model == LRR_BAF
1942 ? lrr_baf_emis_log_lkl(lrr, baf, n_imap, imap_arr, model->err_log_prb,
1943 model->lrr_bias, model->lrr_hap2dip, self->adjlrr_sd,
1944 self->stats.dispersion, hs_arr, n_hs)
1945 : baf_phase_emis_log_lkl(baf, gt_phase, n_imap, imap_arr, model->err_log_prb,
1946 self->stats.dispersion, hs_arr, n_hs);
1947 }
1948 path = log_viterbi_run(emis_log_lkl, n_imap, n_hs + (hmm_model == LRR_BAF ? n_hs : 0),
1949 model->xy_major_log_prb, model->xy_minor_log_prb, tel_log_prb,
1950 hmm_model == LRR_BAF ? NAN : model->flip_log_prb, last_p, first_q);
1951 free(emis_log_lkl);
1952
1953 if (hmm_model == LRR_BAF)
1954 for (int i = 0; i < n_imap; i++)
1955 if (path[i] > n_hs) path[i] = 0;
1956 ret = get_path_segs(path, hs_arr, n_imap, hmm_model, middle, &beg, &m_beg, &end, &m_end, &nseg);
1957
1958 if (ret) // two consecutive hidden states were used, hinting that
1959 // testing of a middle state might be necessary
1960 {
1961 free(path);
1962 n_hs++;
1963 if (middle && ret < 0.0f) middle++;
1964 hts_expand(float, n_hs, m_hs, hs_arr);
1965 hs_arr[n_hs - 1] = ret;
1966 ks_introsort_float(n_hs, hs_arr);
1967 }
1968 } while (ret);
1969
1970 // loop through all the segments called by the Viterbi algorithm
1971 for (int i = 0; i < nseg; i++) {
1972 // compute edges of the call
1973 int a = imap_arr[beg[i]];
1974 if (beg[i] == 0)
1975 while (a > 0 && ldev[a - 1] == 0 && bdev[a - 1] == 0) a--; // extend call towards p telomere
1976 int b = imap_arr[end[i]];
1977 if (end[i] == n_imap - 1)
1978 while (b < n - 1 && ldev[b + 1] == 0 && bdev[b + 1] == 0) b++; // extend call towards q telomere
1979
1980 // if an autosomal call spans the whole chromosome and seems an isochromosome event, split it in two
1981 if (model->rid != model->genome_rules->x_rid && a == 0 && b == n - 1 && last_p > 0 && first_q < n_imap) {
1982 float p_arm_ldev = get_median(lrr + 0, imap_arr[last_p - 1] + 1, NULL);
1983 float q_arm_ldev = get_median(lrr + imap_arr[first_q], n - imap_arr[first_q], NULL);
1984 float left_arm_ldev_se = get_se_mean(lrr + 0, imap_arr[last_p - 1] + 1, NULL);
1985 float right_arm_ldev_se = get_se_mean(lrr + imap_arr[first_q], n - imap_arr[first_q], NULL);
1986 float z2 = sqf(p_arm_ldev - q_arm_ldev) / (sqf(left_arm_ldev_se) + sqf(right_arm_ldev_se));
1987 if (p_arm_ldev * q_arm_ldev < 0.0f && z2 > 8.0f * M_LN10) {
1988 end[i] = last_p - 1;
1989 b = imap_arr[end[i]];
1990 nseg++;
1991 hts_expand(int, nseg, m_beg, beg);
1992 hts_expand(int, nseg, m_end, end);
1993 beg[nseg - 1] = first_q;
1994 end[nseg - 1] = n_imap - 1;
1995 }
1996 }
1997
1998 mocha.ldev = get_median(lrr + a, b + 1 - a, NULL);
1999 get_mocha_stats(pos, lrr, baf, gt_phase, n, a, b, cen_beg, cen_end, length, self->stats.baf_conc, &mocha);
2000
2001 double bdev_lrr_baf;
2002 f3_data_t *f3_data = f3_pack(lrr, a, ad0, ad1, &mocha, model,
2003 self, baf);
2004 kmin_brent(f3, -0.15, 0.15, f3_data, KMIN_EPS, &bdev_lrr_baf);
2005 free(f3_data);
2006 if (model->flags & WGS_DATA)
2007 mocha.lod_lrr_baf =
2008 lrr_ad_lod(lrr + a, ad0 + a, ad1 + a, mocha.n_sites, NULL, model->err_log_prb, model->lrr_bias,
2009 model->lrr_hap2dip, self->adjlrr_sd, self->stats.dispersion, bdev_lrr_baf);
2010 else
2011 mocha.lod_lrr_baf =
2012 lrr_baf_lod(lrr + a, baf + a, mocha.n_sites, NULL, model->err_log_prb, model->lrr_bias,
2013 model->lrr_hap2dip, self->adjlrr_sd, self->stats.dispersion, bdev_lrr_baf);
2014
2015 if (hmm_model == LRR_BAF) {
2016 // here you need to check whether the call would have been
2017 // better with the phased HMM model
2018 int n_hets_imap = 0;
2019 for (int j = beg[i]; j <= end[i]; j++)
2020 if (!isnan(baf[imap_arr[j]])) {
2021 n_hets_imap++;
2022 hets_imap_arr[n_hets_imap - 1] = imap_arr[j];
2023 }
2024 // TODO here it needs to pass information about the centromeres
2025 if (model->flags & WGS_DATA) {
2026 mocha.lod_baf_phase =
2027 compare_wgs_models(ad0, ad1, gt_phase, n_hets_imap, hets_imap_arr,
2028 (model->xy_major_log_prb + model->xy_minor_log_prb) / 2.0f,
2029 model->err_log_prb, model->flip_log_prb, tel_log_prb, self->stats.dispersion,
2030 model->bdev_baf_phase, model->bdev_baf_phase_n);
2031 } else {
2032 mocha.lod_baf_phase =
2033 compare_models(baf, gt_phase, n_hets_imap, hets_imap_arr,
2034 (model->xy_major_log_prb + model->xy_minor_log_prb) / 2.0f, model->err_log_prb,
2035 model->flip_log_prb, tel_log_prb, self->stats.dispersion, model->bdev_baf_phase,
2036 model->bdev_baf_phase_n);
2037 }
2038 if (mocha.lod_baf_phase > 0.8 * mocha.lod_lrr_baf) continue;
2039
2040 // compute bdev, if possible
2041 if (n_hets_imap > 0) {
2042 mocha.bdev = model->flags & WGS_DATA
2043 ? get_ad_bdev(ad0, ad1, n_hets_imap, hets_imap_arr, self->stats.dispersion)
2044 : get_baf_bdev(baf, n_hets_imap, hets_imap_arr, self->stats.dispersion);
2045 } else {
2046 mocha.bdev = NAN;
2047 }
2048 mocha.bdev_se = NAN;
2049 for (int j = 0; j < n_hets_imap; j++) as[hets_imap_arr[j]] = (int8_t)SIGN(baf[hets_imap_arr[j]] - 0.5f);
2050 } else {
2051 // penalizes the LOD by the number of phase flips
2052 mocha.n_flips = 0;
2053 for (int j = beg[i]; j < end[i]; j++)
2054 if (path[j] != path[j + 1]) mocha.n_flips++;
2055
2056 if (model->flags & WGS_DATA) {
2057 double bdev;
2058 f6_data_t *f6_data = f6_pack(ad0, ad1, gt_phase, &mocha,
2059 imap_arr, beg, i, path, self);
2060 kmin_brent(f6, 0.1, 0.2, f6_data, KMIN_EPS, &bdev);
2061 free(f6_data);
2062 mocha.bdev = fabsf((float)bdev);
2063 mocha.lod_baf_phase =
2064 ad_phase_lod(ad0, ad1, gt_phase, mocha.n_hets, imap_arr + beg[i], path + beg[i],
2065 model->err_log_prb, self->stats.dispersion, mocha.bdev);
2066 } else {
2067 double bdev;
2068 f8_data_t *f8_data = f8_pack(baf, gt_phase, &mocha,
2069 imap_arr, beg, i, path, self);
2070 kmin_brent(f8, 0.1, 0.2, f8_data, KMIN_EPS, &bdev);
2071 free(f8_data);
2072 mocha.bdev = fabsf((float)bdev);
2073 mocha.lod_baf_phase = baf_phase_lod(baf, gt_phase, mocha.n_hets, imap_arr + beg[i], path + beg[i],
2074 model->err_log_prb, self->stats.dispersion, mocha.bdev);
2075 // for larger bdev estimates, a model with phase might underestimate the deviation due to switch
2076 // errors
2077 if (mocha.bdev > self->stats.dispersion) {
2078 float bdev = get_baf_bdev(baf, mocha.n_hets, imap_arr + beg[i], self->stats.dispersion);
2079 if (!isnan(bdev)) mocha.bdev = bdev;
2080 }
2081 }
2082 mocha.lod_baf_phase += (float)mocha.n_flips * model->flip_log_prb * (float)M_LOG10E;
2083
2084 for (int j = 0; j < mocha.n_hets; j++)
2085 pbaf_arr[j] = (baf[imap_arr[beg[i] + j]] - 0.5f) * (float)SIGN(path[beg[i] + j]);
2086 mocha.bdev_se = get_se_mean(pbaf_arr, mocha.n_hets, NULL);
2087 for (int j = beg[i]; j <= end[i]; j++) as[imap_arr[j]] = (int8_t)SIGN(path[j]) * gt_phase[imap_arr[j]];
2088 }
2089
2090 mocha.type = mocha_type(mocha.ldev, mocha.ldev_se, mocha.bdev, mocha.bdev_se, mocha.n_hets,
2091 model->lrr_hap2dip, mocha.p_arm, mocha.q_arm);
2092 mocha.cf = mocha_cell_fraction(mocha.ldev, mocha.ldev_se, mocha.bdev, mocha.n_hets, mocha.type,
2093 model->lrr_hap2dip);
2094 mocha_table->n++;
2095 hts_expand(mocha_t, mocha_table->n, mocha_table->m, mocha_table->a);
2096 mocha_table->a[mocha_table->n - 1] = mocha;
2097
2098 // update information that will be stored in the output VCF and make
2099 // remaining sites missing
2100 for (int j = a; j <= b; j++) {
2101 if (ldev[j] == 0) ldev[j] = float_to_int16(mocha.ldev);
2102 if (bdev[j] == 0) bdev[j] = float_to_int16(mocha.bdev);
2103 lrr[j] = NAN; // do not use the data again
2104 baf[j] = NAN; // do not use the data again
2105 }
2106 }
2107 free(path);
2108 free(beg);
2109 free(end);
2110 }
2111
2112 // clean up
2113 free(hs_arr);
2114 free(imap_arr);
2115 free(hets_imap_arr);
2116 free(pbaf_arr);
2117 free(pos);
2118 free(lrr);
2119 free(baf);
2120 memcpy(self->data_arr[LDEV], ldev, n * sizeof(int16_t));
2121 memcpy(self->data_arr[BDEV], bdev, n * sizeof(int16_t));
2122 memcpy(self->phase_arr, as, n * sizeof(int8_t));
2123 free(ldev);
2124 free(bdev);
2125 free(as);
2126 }
2127
2128 // computes the medoid contig for LRR regression
2129 // TODO weight the coefficients appropriately
get_medoid(const float * coeffs,int n,int order)2130 static int get_medoid(const float *coeffs, int n, int order) {
2131 int medoid_idx = -1;
2132 float prev = INFINITY;
2133 for (int i = 0; i < n; i++) {
2134 float next = 0.0f;
2135 for (int j = 0; j < n; j++) {
2136 if (i != j) {
2137 for (int k = 0; k <= order; k++) {
2138 next += fabsf(coeffs[i * (order + 1) + k] - coeffs[j * (order + 1) + k]);
2139 }
2140 }
2141 }
2142 if (next < prev) {
2143 prev = next;
2144 medoid_idx = i;
2145 }
2146 }
2147 return medoid_idx;
2148 }
2149
2150 // groups numbers in two separate distributions
get_lrr_cutoff(const float * v,int n)2151 static float get_lrr_cutoff(const float *v, int n) {
2152 if (n <= 1) return NAN;
2153 float *w = (float *)malloc(n * sizeof(float));
2154 int j = 0;
2155 for (int i = 0; i < n; i++) {
2156 if (!isnan(v[i])) w[j++] = v[i];
2157 }
2158 if (j <= 1) {
2159 free(w);
2160 return NAN;
2161 }
2162 ks_introsort_float((size_t)j, w);
2163
2164 // identify a reasonable initial split allowing for some outliers
2165 int k = j / 2;
2166 int d = (int)sqrtf((float)j) - 1;
2167 while (k > 1 && w[k - 1] - w[d] > w[j - 1 - d] - w[k - 1]) k--;
2168 while (k < j && w[k] - w[d] < w[j - 1 - d] - w[k]) k++;
2169
2170 // run k-means clustering EM
2171 while (k > 0
2172 && w[k - 1] - (w[(k - 1) / 2] + w[k / 2]) * 0.5f > (w[(j + k - 1) / 2] + w[(j + k) / 2]) * 0.5f - w[k - 1])
2173 k--;
2174 while (k < j && w[k] - (w[(k - 1) / 2] + w[k / 2]) * 0.5f < (w[(j + k - 1) / 2] + w[(j + k) / 2]) * 0.5f - w[k])
2175 k++;
2176
2177 float cutoff = (k > 0 && k < j) ? (w[k - 1] + w[k]) * 0.5f : NAN;
2178 free(w);
2179 return cutoff;
2180 }
2181
2182 typedef struct
2183 {
2184 int16_t *ad0;
2185 int16_t *ad1;
2186 int n;
2187 } f2_data_t;
2188
f2(double x,void * data)2189 double f2(double x, void *data)
2190 {
2191 f2_data_t *d = data;
2192
2193 return -lod_lkl_beta_binomial(d->ad0, d->ad1, d->n, NULL, x);
2194 }
2195
f2_pack(int16_t * ad0,int16_t * ad1,int n)2196 static inline f2_data_t *f2_pack(int16_t *ad0, int16_t *ad1, int n)
2197
2198 {
2199 f2_data_t *d = malloc(sizeof(f2_data_t));
2200
2201 d->ad0 = ad0;
2202 d->ad1 = ad1;
2203 d->n = n;
2204
2205 return d;
2206 }
2207
2208 typedef struct
2209 {
2210 int16_t *ad0;
2211 int16_t *ad1;
2212 int n_imap;
2213 int *imap_arr;
2214 } f9_data_t;
2215
f9(double x,void * data)2216 double f9(double x, void *data)
2217 {
2218 f9_data_t *d = data;
2219
2220 return -lod_lkl_beta_binomial(d->ad0, d->ad1, d->n_imap, d->imap_arr, x);
2221 }
2222
f9_pack(int16_t * ad0,int16_t * ad1,int n_imap,int * imap_arr)2223 static inline f9_data_t *f9_pack(int16_t *ad0, int16_t *ad1, int n_imap,
2224 int *imap_arr)
2225 {
2226 f9_data_t *d = malloc(sizeof(f9_data_t));
2227
2228 d->ad0 = ad0;
2229 d->ad1 = ad1;
2230 d->n_imap = n_imap;
2231 d->imap_arr = imap_arr;
2232
2233 return d;
2234 }
2235
2236 // this function computes several contig stats and then clears the contig data from the sample
sample_stats(sample_t * self,const model_t * model)2237 static void sample_stats(sample_t *self, const model_t *model) {
2238 int n = self->n;
2239 if (n == 0) return;
2240 self->n_sites += n;
2241 for (int i = 0; i < n; i++)
2242 if (self->phase_arr[i] == bcf_int8_missing) self->n_missing_gts++;
2243
2244 int16_t *ad0 = self->data_arr[AD0];
2245 int16_t *ad1 = self->data_arr[AD1];
2246 float *lrr = (float *)malloc(n * sizeof(float));
2247 float *baf = (float *)malloc(n * sizeof(float));
2248 if (model->flags & WGS_DATA) {
2249 ad_to_lrr_baf(ad0, ad1, lrr, baf, n);
2250 } else {
2251 for (int i = 0; i < n; i++) {
2252 lrr[i] = int16_to_float(self->data_arr[LRR][i]);
2253 baf[i] = int16_to_float(self->data_arr[BAF][i]);
2254 }
2255 }
2256 int *imap_arr = (int *)malloc(n * sizeof(int));
2257
2258 if (model->rid == model->genome_rules->x_rid) {
2259 int n_imap = 0;
2260 for (int i = 0; i < n; i++) {
2261 if (!isnan(baf[i])) self->n_hets++;
2262 int pos = model->locus_arr[self->vcf_imap_arr[i]].pos;
2263 if (pos > model->genome_rules->x_nonpar_beg && pos < model->genome_rules->x_nonpar_end
2264 && (pos < model->genome_rules->x_xtr_beg || pos > model->genome_rules->x_xtr_end)) {
2265 if (!isnan(baf[i])) self->x_nonpar_n_hets++;
2266 n_imap++;
2267 imap_arr[n_imap - 1] = i;
2268 } else if (!isnan(baf[i])) {
2269 if (pos <= model->genome_rules->x_nonpar_beg) {
2270 self->par1_n_hets++;
2271 } else if (pos >= model->genome_rules->x_xtr_beg || pos <= model->genome_rules->x_xtr_end) {
2272 self->xtr_n_hets++;
2273 } else if (pos >= model->genome_rules->x_nonpar_end) {
2274 self->par2_n_hets++;
2275 }
2276 }
2277 }
2278 self->x_nonpar_lrr_median = get_median(lrr, n_imap, imap_arr);
2279
2280 if (model->flags & WGS_DATA) {
2281 f9_data_t *f9_data = f9_pack(ad0, ad1, n_imap, imap_arr);
2282 double x;
2283 kmin_brent(f9, 0.1, 0.2, f9_data, KMIN_EPS, &x); // dispersions above 0.5 are not allowed
2284 free(f9_data);
2285 self->x_nonpar_dispersion = (float)x;
2286 } else {
2287 self->x_nonpar_dispersion = get_sample_sd(baf, n_imap, imap_arr);
2288 }
2289 } else if (model->rid == model->genome_rules->y_rid) {
2290 int n_imap = 0;
2291 for (int i = 0; i < n; i++) {
2292 if (!isnan(baf[i])) self->n_hets++;
2293 int pos = model->locus_arr[self->vcf_imap_arr[i]].pos;
2294 if (pos > model->genome_rules->y_nonpar_beg && pos < model->genome_rules->y_nonpar_end
2295 && (pos < model->genome_rules->y_xtr_beg || pos > model->genome_rules->y_xtr_end)) {
2296 n_imap++;
2297 imap_arr[n_imap - 1] = i;
2298 }
2299 }
2300 self->y_nonpar_lrr_median = get_median(lrr, n_imap, imap_arr);
2301 } else if (model->rid == model->genome_rules->mt_rid) {
2302 self->mt_lrr_median = get_median(lrr, n, NULL);
2303 } else {
2304 // expand arrays if necessary
2305 self->n_stats++;
2306 hts_expand(stats_t, self->n_stats, self->m_stats, self->stats_arr);
2307
2308 if (model->flags & WGS_DATA) {
2309 double x;
2310 f2_data_t *f2_data = f2_pack(ad0, ad1, n);
2311 kmin_brent(f2, 0.1, 0.2, f2_data, KMIN_EPS, &x); // dispersions above 0.5 are not allowed
2312 free(f2_data);
2313 self->stats_arr[self->n_stats - 1].dispersion = (float)x;
2314 } else {
2315 self->stats_arr[self->n_stats - 1].dispersion = get_sample_sd(baf, n, NULL);
2316 }
2317 for (int i = 0; i < n; i++)
2318 if (!isnan(baf[i])) self->n_hets++;
2319 self->stats_arr[self->n_stats - 1].lrr_median = get_median(lrr, n, NULL);
2320 self->stats_arr[self->n_stats - 1].lrr_sd = get_sample_sd(lrr, n, NULL);
2321
2322 int conc, disc;
2323 get_baf_conc(baf, self->phase_arr, n, &conc, &disc);
2324 self->stats_arr[self->n_stats - 1].baf_conc = (float)conc / (float)(conc + disc);
2325 self->stats_arr[self->n_stats - 1].baf_auto = get_baf_auto_corr(baf, self->phase_arr, n);
2326 // performs polynomial regression for LRR
2327 if (model->lrr_gc_order > 0 && n > model->lrr_gc_order) {
2328 float tss = get_tss(lrr, n);
2329 int ret = polyfit(lrr, model->gc_arr, n, self->vcf_imap_arr, model->lrr_gc_order,
2330 self->stats_arr[self->n_stats - 1].coeffs);
2331 if (ret < 0) error("Polynomial regression failed\n");
2332 adjust_lrr(lrr, model->gc_arr, n, self->vcf_imap_arr, self->stats_arr[self->n_stats - 1].coeffs,
2333 model->lrr_gc_order);
2334 self->stats_arr[self->n_stats - 1].coeffs[0] += get_median(lrr, n, NULL); // further adjusts by median
2335 float rss = get_tss(lrr, n);
2336 self->stats_arr[self->n_stats - 1].lrr_gc_rel_ess = 1.0f - rss / tss;
2337 } else if (model->lrr_gc_order != -1) {
2338 self->stats_arr[self->n_stats - 1].coeffs[0] = get_median(lrr, n, NULL);
2339 self->stats_arr[self->n_stats - 1].lrr_gc_rel_ess = NAN;
2340 }
2341 // compute autocorrelation after GC correction
2342 self->stats_arr[self->n_stats - 1].lrr_auto = get_lrr_auto_corr(lrr, n, NULL);
2343 }
2344
2345 free(lrr);
2346 free(baf);
2347 free(imap_arr);
2348 }
2349
2350 // this function computes the median of contig stats
sample_summary(sample_t * self,int n,model_t * model,int compute_gender)2351 static void sample_summary(sample_t *self, int n, model_t *model, int compute_gender) {
2352 float *tmp_arr = (float *)malloc(n * sizeof(float));
2353 int m_tmp = n;
2354
2355 for (int i = 0; i < n; i++) {
2356 hts_expand(float, self[i].n_stats *(model->lrr_gc_order < 0 ? 1 : model->lrr_gc_order + 1), m_tmp, tmp_arr);
2357
2358 for (int j = 0; j < self[i].n_stats; j++) tmp_arr[j] = self[i].stats_arr[j].lrr_median;
2359 self[i].stats.lrr_median = get_median(tmp_arr, self[i].n_stats, NULL);
2360 for (int j = 0; j < self[i].n_stats; j++) tmp_arr[j] = self[i].stats_arr[j].lrr_sd;
2361 self[i].stats.lrr_sd = get_median(tmp_arr, self[i].n_stats, NULL);
2362 for (int j = 0; j < self[i].n_stats; j++) tmp_arr[j] = self[i].stats_arr[j].lrr_auto;
2363 self[i].stats.lrr_auto = get_median(tmp_arr, self[i].n_stats, NULL);
2364
2365 for (int j = 0; j < self[i].n_stats; j++) tmp_arr[j] = self[i].stats_arr[j].dispersion;
2366 self[i].stats.dispersion = get_median(tmp_arr, self[i].n_stats, NULL);
2367 for (int j = 0; j < self[i].n_stats; j++) tmp_arr[j] = self[i].stats_arr[j].baf_conc;
2368 self[i].stats.baf_conc = get_median(tmp_arr, self[i].n_stats, NULL);
2369 for (int j = 0; j < self[i].n_stats; j++) tmp_arr[j] = self[i].stats_arr[j].baf_auto;
2370 self[i].stats.baf_auto = get_median(tmp_arr, self[i].n_stats, NULL);
2371
2372 self[i].adjlrr_sd = self[i].stats.lrr_sd;
2373 if (model->lrr_gc_order == 0) {
2374 for (int j = 0; j < self[i].n_stats; j++) tmp_arr[j] = self[i].stats_arr[j].coeffs[0];
2375 self[i].stats.coeffs[0] = get_median(tmp_arr, self[i].n_stats, NULL);
2376 } else if (model->lrr_gc_order > 0 && self[i].n_stats > 0) {
2377 for (int j = 0; j < self[i].n_stats; j++)
2378 for (int k = 0; k <= model->lrr_gc_order; k++)
2379 tmp_arr[j * (model->lrr_gc_order + 1) + k] = self[i].stats_arr[j].coeffs[k];
2380 int medoid_idx = get_medoid(tmp_arr, self[i].n_stats, model->lrr_gc_order);
2381 for (int k = 0; k <= model->lrr_gc_order; k++)
2382 self[i].stats.coeffs[k] = tmp_arr[medoid_idx * (model->lrr_gc_order + 1) + k];
2383 for (int j = 0; j < self[i].n_stats; j++) tmp_arr[j] = self[i].stats_arr[j].lrr_gc_rel_ess;
2384 self[i].stats.lrr_gc_rel_ess = get_median(tmp_arr, self[i].n_stats, NULL);
2385 self[i].adjlrr_sd *= sqrtf(1.0f - self[i].stats.lrr_gc_rel_ess); // not perfect, but good enough(?)
2386 }
2387 free(self[i].stats_arr);
2388 }
2389
2390 if (compute_gender) {
2391 if (model->flags & WGS_DATA) {
2392 if (isnan(model->lrr_cutoff)) model->lrr_cutoff = -0.3f; // arbitrary cutoff between -M_LN2 and 0
2393 }
2394
2395 // determine LRR cutoff between haploid and diploid
2396 if (isnan(model->lrr_cutoff)) {
2397 int j = 0;
2398 for (int i = 0; i < n; i++)
2399 if (!isnan(self[i].x_nonpar_lrr_median))
2400 tmp_arr[j++] = isnan(self[i].stats.lrr_median)
2401 ? self[i].x_nonpar_lrr_median
2402 : self[i].x_nonpar_lrr_median - self[i].stats.lrr_median;
2403 model->lrr_cutoff = get_lrr_cutoff(tmp_arr, j);
2404 }
2405
2406 // determine gender of samples
2407 for (int i = 0; i < n; i++) {
2408 float tmp = isnan(self[i].stats.lrr_median) ? self[i].x_nonpar_lrr_median
2409 : self[i].x_nonpar_lrr_median - self[i].stats.lrr_median;
2410 if (tmp < model->lrr_cutoff)
2411 self[i].computed_gender = GENDER_MALE;
2412 else if (tmp > model->lrr_cutoff)
2413 self[i].computed_gender = GENDER_FEMALE;
2414 }
2415 }
2416
2417 free(tmp_arr);
2418 }
2419
mocha_print_stats(FILE * restrict stream,const sample_t * self,int n,int lrr_gc_order,const bcf_hdr_t * hdr,int flags)2420 static void mocha_print_stats(FILE *restrict stream, const sample_t *self, int n, int lrr_gc_order,
2421 const bcf_hdr_t *hdr, int flags) {
2422 if (stream == NULL) return;
2423 char gender[3];
2424 gender[GENDER_UNKNOWN] = 'U';
2425 gender[GENDER_MALE] = 'M';
2426 gender[GENDER_FEMALE] = 'F';
2427 fputs("sample_id", stream);
2428 fputs("\tcomputed_gender", stream);
2429 fputs("\tcall_rate", stream);
2430 fputs(flags & WGS_DATA ? "\tcov_median" : "\tlrr_median", stream);
2431 fputs(flags & WGS_DATA ? "\tcov_sd" : "\tlrr_sd", stream);
2432 fputs(flags & WGS_DATA ? "\tcov_auto" : "\tlrr_auto", stream);
2433 fputs(flags & WGS_DATA ? "\tbaf_corr" : "\tbaf_sd", stream);
2434 fputs("\tbaf_conc", stream);
2435 fputs("\tbaf_auto", stream);
2436 fputs("\tn_sites", stream);
2437 fputs("\tn_hets", stream);
2438 fputs("\tx_nonpar_n_hets", stream);
2439 fputs("\tpar1_n_hets", stream);
2440 fputs("\txtr_n_hets", stream);
2441 fputs("\tpar2_n_hets", stream);
2442 fputs("\tx_nonpar_baf_corr", stream);
2443 fputs(flags & WGS_DATA ? "\tx_nonpar_cov_median" : "\tx_nonpar_lrr_median", stream);
2444 fputs(flags & WGS_DATA ? "\ty_nonpar_cov_median" : "\ty_nonpar_lrr_median", stream);
2445 fputs(flags & WGS_DATA ? "\tmt_cov_median" : "\tmt_lrr_median", stream);
2446 fputs("\tlrr_gc_rel_ess", stream);
2447 for (int j = 0; j <= lrr_gc_order; j++) fprintf(stream, "\tlrr_gc_%d", j);
2448 fputc('\n', stream);
2449 for (int i = 0; i < n; i++) {
2450 fputs(bcf_hdr_int2id(hdr, BCF_DT_SAMPLE, self[i].idx), stream);
2451 fprintf(stream, "\t%c", gender[self[i].computed_gender]);
2452 fprintf(stream, "\t%.4f",
2453 self[i].stats.call_rate ? self[i].stats.call_rate
2454 : 1.0 - (float)self[i].n_missing_gts / (float)self[i].n_sites);
2455 fprintf(stream, "\t%.4f", flags & WGS_DATA ? expf(self[i].stats.lrr_median) : self[i].stats.lrr_median);
2456 fprintf(stream, "\t%.4f",
2457 flags & WGS_DATA ? expf(self[i].stats.lrr_median) * self[i].stats.lrr_sd : self[i].stats.lrr_sd);
2458 fprintf(stream, "\t%.4f", self[i].stats.lrr_auto);
2459 fprintf(stream, "\t%.4f", self[i].stats.dispersion);
2460 fprintf(stream, "\t%.4f", self[i].stats.baf_conc);
2461 fprintf(stream, "\t%.4f", self[i].stats.baf_auto);
2462 fprintf(stream, "\t%d", self[i].n_sites);
2463 fprintf(stream, "\t%d", self[i].n_hets);
2464 fprintf(stream, "\t%d", self[i].x_nonpar_n_hets);
2465 fprintf(stream, "\t%d", self[i].par1_n_hets);
2466 fprintf(stream, "\t%d", self[i].xtr_n_hets);
2467 fprintf(stream, "\t%d", self[i].par2_n_hets);
2468 fprintf(stream, "\t%.4f", self[i].x_nonpar_dispersion);
2469 fprintf(stream, "\t%.4f", flags & WGS_DATA ? expf(self[i].x_nonpar_lrr_median) : self[i].x_nonpar_lrr_median);
2470 fprintf(stream, "\t%.4f", flags & WGS_DATA ? expf(self[i].y_nonpar_lrr_median) : self[i].y_nonpar_lrr_median);
2471 fprintf(stream, "\t%.4f", flags & WGS_DATA ? expf(self[i].mt_lrr_median) : self[i].mt_lrr_median);
2472 fprintf(stream, "\t%.4f", self[i].stats.lrr_gc_rel_ess);
2473 for (int j = 0; j <= lrr_gc_order; j++) fprintf(stream, "\t%.4f", self[i].stats.coeffs[j]);
2474 fputc('\n', stream);
2475 }
2476 if (stream != stdout && stream != stderr) fclose(stream);
2477 }
2478
2479 /*********************************
2480 * VCF READ AND WRITE METHODS *
2481 *********************************/
2482
2483 // moves synced bcf reader to first useful line for reader0
bcf_sr_next_line_reader0(bcf_srs_t * sr)2484 static inline int bcf_sr_next_line_reader0(bcf_srs_t *sr) {
2485 int nret = bcf_sr_next_line(sr);
2486 while (nret > 0 && !bcf_sr_has_line(sr, 0)) nret = bcf_sr_next_line(sr);
2487 return nret;
2488 }
2489
2490 // write one contig
put_contig(bcf_srs_t * sr,const sample_t * sample,const model_t * model,htsFile * out_fh,bcf_hdr_t * out_hdr)2491 static int put_contig(bcf_srs_t *sr, const sample_t *sample, const model_t *model, htsFile *out_fh,
2492 bcf_hdr_t *out_hdr) {
2493 int rid = model->rid;
2494 bcf_hdr_t *hdr = bcf_sr_get_header(sr, 0);
2495 bcf_sr_seek(sr, bcf_hdr_id2name(hdr, rid), 0);
2496 int nsmpl = bcf_hdr_nsamples(out_hdr);
2497
2498 int *synced_iter = (int *)calloc(nsmpl, sizeof(int));
2499 float *bdev = (float *)calloc(nsmpl, sizeof(float));
2500 float *ldev = (float *)calloc(nsmpl, sizeof(float));
2501 int *as = (int *)calloc(nsmpl, sizeof(int));
2502
2503 int i;
2504 for (i = 0; bcf_sr_next_line_reader0(sr); i++) {
2505 bcf1_t *line = bcf_sr_get_line(sr, 0);
2506 if (rid != line->rid) break;
2507
2508 for (int j = 0; j < nsmpl; j++) {
2509 while (synced_iter[j] < sample[j].n - 1 && sample[j].vcf_imap_arr[synced_iter[j]] < i) synced_iter[j]++;
2510 if (sample[j].vcf_imap_arr[synced_iter[j]] == i) {
2511 if (sample[j].data_arr[LDEV])
2512 ldev[sample[j].idx] = int16_to_float(sample[j].data_arr[LDEV][synced_iter[j]]);
2513 if (sample[j].data_arr[BDEV])
2514 bdev[sample[j].idx] = int16_to_float(sample[j].data_arr[BDEV][synced_iter[j]]);
2515 if (sample[j].phase_arr) as[sample[j].idx] = sample[j].phase_arr[synced_iter[j]];
2516 } else {
2517 // if no match variant found, match the end of the contig or keep conservative
2518 if (i == 0 && sample[j].data_arr[BDEV])
2519 bdev[sample[j].idx] = int16_to_float(sample[j].data_arr[BDEV][0]);
2520 if (i == 0 && sample[j].data_arr[LDEV])
2521 ldev[sample[j].idx] = int16_to_float(sample[j].data_arr[LDEV][0]);
2522 if (sample[j].data_arr[BDEV] && int16_to_float(sample[j].data_arr[BDEV][synced_iter[j]]) == 0.0f)
2523 bdev[sample[j].idx] = 0.0f;
2524 if (sample[j].data_arr[LDEV] && int16_to_float(sample[j].data_arr[LDEV][synced_iter[j]]) == 0.0f)
2525 ldev[sample[j].idx] = 0.0f;
2526 if (sample[j].phase_arr) as[sample[j].idx] = 0;
2527 }
2528 }
2529 if (!(model->flags & WGS_DATA)) {
2530 bcf_update_info_float(out_hdr, line, "ADJ_COEFF", model->locus_arr[i].adjust, 9);
2531 }
2532 if (!(model->flags & NO_ANNOT)) {
2533 bcf_update_format_float(out_hdr, line, "Ldev", ldev, (int)nsmpl);
2534 bcf_update_format_float(out_hdr, line, "Bdev", bdev, (int)nsmpl);
2535 }
2536 bcf_update_format_int32(out_hdr, line, "AS", as, (int)nsmpl);
2537
2538 if (bcf_write(out_fh, out_hdr, line) < 0) error("Unable to write to output VCF file\n");
2539 }
2540
2541 free(synced_iter);
2542 free(ldev);
2543 free(bdev);
2544 free(as);
2545
2546 return i;
2547 }
2548
2549 // write header
print_hdr(htsFile * out_fh,bcf_hdr_t * hdr,int argc,char * argv[],int record_cmd_line,int flags)2550 static bcf_hdr_t *print_hdr(htsFile *out_fh, bcf_hdr_t *hdr, int argc, char *argv[], int record_cmd_line, int flags) {
2551 bcf_hdr_t *out_hdr = bcf_hdr_dup(hdr);
2552 int adj_coeff_id = bcf_hdr_id2int(hdr, BCF_DT_ID, "ADJ_COEFF");
2553 if (!(flags & WGS_DATA) && !bcf_hdr_idinfo_exists(hdr, BCF_HL_INFO, adj_coeff_id))
2554 bcf_hdr_append(out_hdr,
2555 "##INFO=<ID=ADJ_COEFF,Number=9,Type=Float,Description=\"Adjust coefficients "
2556 "(order=AA_BAF0,AA_BAF1,AA_LRR0,AB_BAF0,AB_BAF1,AB_LRR0,BB_BAF0,BB_BAF1,BB_LRR0)"
2557 "\">");
2558 if (!(flags & NO_ANNOT)) {
2559 int ldev_id = bcf_hdr_id2int(hdr, BCF_DT_ID, "Ldev");
2560 if (!bcf_hdr_idinfo_exists(hdr, BCF_HL_FMT, ldev_id))
2561 bcf_hdr_append(out_hdr,
2562 "##FORMAT=<ID=Ldev,Number=1,Type=Float,Description=\"LRR deviation "
2563 "due to chromosomal alteration\">");
2564
2565 int bdev_id = bcf_hdr_id2int(hdr, BCF_DT_ID, "Bdev");
2566 if (!bcf_hdr_idinfo_exists(hdr, BCF_HL_FMT, bdev_id))
2567 bcf_hdr_append(out_hdr,
2568 "##FORMAT=<ID=Bdev,Number=1,Type=Float,Description=\"BAF deviation "
2569 "due to chromosomal alteration\">");
2570 }
2571 int as_id = bcf_hdr_id2int(hdr, BCF_DT_ID, "AS");
2572 if (!bcf_hdr_idinfo_exists(hdr, BCF_HL_FMT, as_id))
2573 bcf_hdr_append(out_hdr,
2574 "##FORMAT=<ID=AS,Number=1,Type=Integer,Description=\"Allelic "
2575 "shift (1/-1 if the alternate allele is over/under represented)\">");
2576 if (record_cmd_line) bcf_hdr_append_version(out_hdr, argc, argv, "bcftools_plugin");
2577 if (bcf_hdr_write(out_fh, out_hdr) < 0) error("Unable to write to output VCF file\n");
2578 return out_hdr;
2579 }
2580
2581 // retrieve genotypes as NC, AA, AB, BB
2582 // assumes little endian architecture
bcf_get_ab_genotypes(bcf_fmt_t * fmt,int8_t * gts,int nsmpl,int allele_a,int allele_b)2583 static int bcf_get_ab_genotypes(bcf_fmt_t *fmt, int8_t *gts, int nsmpl, int allele_a, int allele_b) {
2584 if (!fmt || fmt->n != 2) return 0;
2585
2586 #define BRANCH(type_t, bcf_type_vector_end) \
2587 { \
2588 type_t *p = (type_t *)fmt->p; \
2589 for (int i = 0; i < nsmpl; i++, p += 2) { \
2590 if (p[0] == bcf_type_vector_end || bcf_gt_is_missing(p[0]) || p[1] == bcf_type_vector_end \
2591 || bcf_gt_is_missing(p[1])) { \
2592 gts[i] = GT_NC; \
2593 } else { \
2594 type_t allele_0 = bcf_gt_allele(p[0]); \
2595 type_t allele_1 = bcf_gt_allele(p[1]); \
2596 if (allele_0 == allele_a && allele_1 == allele_a) \
2597 gts[i] = GT_AA; \
2598 else if (allele_0 == allele_b && allele_1 == allele_b) \
2599 gts[i] = GT_BB; \
2600 else if ((allele_0 == allele_a && allele_1 == allele_b) \
2601 || (allele_0 == allele_b && allele_1 == allele_a)) \
2602 gts[i] = GT_AB; \
2603 else \
2604 return -1; \
2605 } \
2606 } \
2607 }
2608 switch (fmt->type) {
2609 case BCF_BT_INT8:
2610 BRANCH(int8_t, bcf_int8_vector_end);
2611 break;
2612 case BCF_BT_INT16:
2613 BRANCH(int16_t, bcf_int16_vector_end);
2614 break;
2615 case BCF_BT_INT32:
2616 BRANCH(int32_t, bcf_int32_vector_end);
2617 break;
2618 default:
2619 error("Unexpected type %d\n", fmt->type);
2620 }
2621 #undef BRANCH
2622
2623 return 1;
2624 }
2625
2626 // read one contig
get_contig(bcf_srs_t * sr,sample_t * sample,model_t * model)2627 static int get_contig(bcf_srs_t *sr, sample_t *sample, model_t *model) {
2628 int rid = model->rid;
2629 bcf_hdr_t *hdr = bcf_sr_get_header(sr, 0);
2630 bcf_sr_seek(sr, bcf_hdr_id2name(hdr, rid), 0);
2631
2632 bcf_fmt_t *baf_fmt = NULL, *lrr_fmt = NULL;
2633 bcf_info_t *info;
2634 int nsmpl = bcf_hdr_nsamples(hdr);
2635
2636 model->n = 0;
2637 model->n_flipped = 0;
2638 for (int j = 0; j < nsmpl; j++) sample[j].n = 0;
2639
2640 if (!(model->flags & USE_NO_RULES_CHRS) && model->genome_rules->cen_beg[rid] == 0
2641 && model->genome_rules->cen_end[rid] == 0 && rid != model->genome_rules->mt_rid)
2642 return 0;
2643
2644 int8_t *gts = (int8_t *)malloc(nsmpl * sizeof(int8_t));
2645 int8_t *phase_arr = (int8_t *)malloc(nsmpl * sizeof(int8_t));
2646 int *imap_arr = (int *)malloc(nsmpl * sizeof(int));
2647 int16_t *gt0 = (int16_t *)malloc(nsmpl * sizeof(int16_t));
2648 int16_t *gt1 = (int16_t *)malloc(nsmpl * sizeof(int16_t));
2649 int16_t *ad0 = (int16_t *)malloc(nsmpl * sizeof(int16_t));
2650 int16_t *ad1 = (int16_t *)malloc(nsmpl * sizeof(int16_t));
2651 int *last_het_pos = (int *)calloc(nsmpl, sizeof(int));
2652 int *last_pos = (int *)calloc(nsmpl, sizeof(int));
2653
2654 int i;
2655 for (i = 0; bcf_sr_next_line_reader0(sr); i++) {
2656 bcf1_t *line = bcf_sr_get_line(sr, 0);
2657 if (model->filter) {
2658 int ret = filter_test(model->filter, line, NULL);
2659 if ((model->filter_logic == FLT_INCLUDE && !ret) || ret) continue;
2660 }
2661 if (rid != line->rid) break;
2662 int pos = line->pos + 1;
2663
2664 hts_expand(locus_t, i + 1, model->m_locus, model->locus_arr);
2665
2666 model->locus_arr[i].pos = pos;
2667
2668 hts_expand(float, i + 1, model->m_gc, model->gc_arr);
2669 if (model->gc_id >= 0 && (info = bcf_get_info_id(line, model->gc_id)))
2670 model->gc_arr[i] = info->v1.f;
2671 else
2672 model->gc_arr[i] = NAN;
2673
2674 // if failing inclusion/exclusion requirement, skip line
2675 if ((model->flags & FLT_EXCLUDE) && bcf_sr_get_line(sr, 1)) continue;
2676 if ((model->flags & FLT_INCLUDE) && !bcf_sr_get_line(sr, 1)) continue;
2677
2678 // if site falls in short arm or centromere regions skip line
2679 if (!(model->flags & USE_SHORT_ARMS) && model->genome_rules->is_short_arm[rid]
2680 && pos < model->genome_rules->cen_beg[rid])
2681 continue;
2682 if (!(model->flags & USE_CENTROMERES) && pos > model->genome_rules->cen_beg[rid]
2683 && pos < model->genome_rules->cen_end[rid])
2684 continue;
2685
2686 if (model->mhc_idx
2687 && regidx_overlap(model->mhc_idx, bcf_hdr_id2name(hdr, line->rid), line->pos, line->pos + 1, NULL))
2688 continue;
2689 if (model->kir_idx
2690 && regidx_overlap(model->kir_idx, bcf_hdr_id2name(hdr, line->rid), line->pos, line->pos + 1, NULL))
2691 continue;
2692
2693 bcf_fmt_t *gt_fmt = bcf_get_fmt_id(line, model->gt_id);
2694 if (!bcf_get_genotype_phase(gt_fmt, phase_arr, nsmpl)) continue;
2695
2696 // if neither AD nor LRR and BAF formats are present, skip line
2697 if (model->flags & WGS_DATA) {
2698 if (!bcf_get_unphased_genotype_alleles(gt_fmt, gt0, gt1, nsmpl)) continue;
2699 if (!bcf_get_allelic_depth(bcf_get_fmt_id(line, model->ad_id), gt0, gt1, ad0, ad1, nsmpl)) continue;
2700 } else {
2701 if ((info = bcf_get_info_id(line, model->allele_a_id)))
2702 model->locus_arr[i].allele_a = info->v1.i;
2703 else
2704 error("Error: ALLELE_A missing at position %s:%" PRId64 "\n", bcf_hdr_id2name(hdr, line->rid),
2705 line->pos + 1);
2706 if ((info = bcf_get_info_id(line, model->allele_b_id)))
2707 model->locus_arr[i].allele_b = info->v1.i;
2708 else
2709 error("Error: ALLELE_B missing at position %s:%" PRId64 "\n", bcf_hdr_id2name(hdr, line->rid),
2710 line->pos + 1);
2711 // missing ALLELE_A and ALLELE_B information
2712 if (model->locus_arr[i].allele_a < 0 || model->locus_arr[i].allele_b < 0) continue;
2713 // if there are no genotypes, skip line
2714 if (bcf_get_ab_genotypes(gt_fmt, gts, nsmpl, model->locus_arr[i].allele_a, model->locus_arr[i].allele_b)
2715 < 0)
2716 error("Error: site %s:%" PRId64 " contains non-AB alleles\n", bcf_hdr_id2name(hdr, line->rid),
2717 line->pos + 1);
2718
2719 if (!(lrr_fmt = bcf_get_fmt_id(line, model->lrr_id)) || !(baf_fmt = bcf_get_fmt_id(line, model->baf_id)))
2720 continue;
2721
2722 for (int j = 0; j < nsmpl; j++) {
2723 if (bcf_float_is_missing(((float *)lrr_fmt->p)[sample[j].idx]))
2724 ((float *)lrr_fmt->p)[sample[j].idx] = NAN;
2725 if (bcf_float_is_missing(((float *)baf_fmt->p)[sample[j].idx]))
2726 ((float *)baf_fmt->p)[sample[j].idx] = NAN;
2727 }
2728
2729 int is_x_nonpar = rid == model->genome_rules->x_rid && pos > model->genome_rules->x_nonpar_beg
2730 && pos < model->genome_rules->x_nonpar_end
2731 && (pos < model->genome_rules->x_xtr_beg || pos > model->genome_rules->x_xtr_end);
2732 int is_y_or_mt = rid == model->genome_rules->y_rid || rid == model->genome_rules->mt_rid;
2733
2734 // adjust cluster centers and slopes, inspired by
2735 // (i) Staaf, J. et al. Normalization of Illumina Infinium whole-genome
2736 // SNP data improves copy number estimates and allelic intensity ratios.
2737 // BMC Bioinformatics 9, 409 (2008) & (ii) Mayrhofer, M., Viklund, B. &
2738 // Isaksson, A. Rawcopy: Improved copy number analysis with Affymetrix
2739 // arrays. Sci Rep 6, 36158 (2016)
2740 for (int gt = GT_AA; gt <= GT_BB; gt++) {
2741 if (is_y_or_mt) continue;
2742 int k = 0;
2743 for (int j = 0; j < nsmpl; j++) {
2744 if (is_x_nonpar && sample[j].computed_gender == GENDER_MALE) continue;
2745 if (gts[sample[j].idx] == gt) imap_arr[k++] = sample[j].idx;
2746 }
2747 float baf_b = 0.0f, baf_m = 0.0f, lrr_b = 0.0f;
2748 if (model->regress_baf_lrr != -1 && model->regress_baf_lrr <= k) {
2749 float xss = 0.0f, yss = 0.0f, xyss = 0.0f;
2750 get_cov((float *)lrr_fmt->p, (float *)baf_fmt->p, k, imap_arr, &xss, &yss, &xyss);
2751 baf_m = xyss / xss;
2752 for (int j = 0; j < k; j++)
2753 ((float *)baf_fmt->p)[imap_arr[j]] -= baf_m * ((float *)lrr_fmt->p)[imap_arr[j]];
2754 }
2755 if (model->adj_baf_lrr != -1 && model->adj_baf_lrr <= k) {
2756 baf_b = get_median((float *)baf_fmt->p, k, imap_arr) - (float)(gt - 1) * 0.5f;
2757 if (isnan(baf_b)) baf_b = 0.0f;
2758 for (int j = 0; j < k; j++) ((float *)baf_fmt->p)[imap_arr[j]] -= baf_b;
2759 lrr_b = get_median((float *)lrr_fmt->p, k, imap_arr);
2760 if (isnan(lrr_b)) lrr_b = 0.0f;
2761 for (int j = 0; j < k; j++) ((float *)lrr_fmt->p)[imap_arr[j]] -= lrr_b;
2762 }
2763 // corrects males on X nonPAR
2764 if (is_x_nonpar) {
2765 for (int j = 0; j < nsmpl; j++) {
2766 if (sample[j].computed_gender != GENDER_MALE) continue;
2767 if (gts[sample[j].idx] == gt) {
2768 ((float *)baf_fmt->p)[sample[j].idx] -=
2769 baf_m * ((float *)lrr_fmt->p)[sample[j].idx] + baf_b;
2770 ((float *)lrr_fmt->p)[sample[j].idx] -= lrr_b;
2771 }
2772 }
2773 }
2774 model->locus_arr[i].adjust[gt - 1][0] = baf_b;
2775 model->locus_arr[i].adjust[gt - 1][1] = baf_m;
2776 model->locus_arr[i].adjust[gt - 1][2] = lrr_b;
2777 }
2778
2779 // if allele A index is bigger than allele B index flip the BAF to make
2780 // sure it refers to the highest allele
2781 if (model->locus_arr[i].allele_a > model->locus_arr[i].allele_b) {
2782 for (int j = 0; j < nsmpl; j++)
2783 ((float *)baf_fmt->p)[sample[j].idx] = 1.0f - ((float *)baf_fmt->p)[sample[j].idx];
2784 }
2785
2786 for (int j = 0; j < nsmpl; j++)
2787 if (gts[j] != GT_AB || (is_x_nonpar && sample[j].computed_gender == GENDER_MALE) || is_y_or_mt)
2788 ((float *)baf_fmt->p)[sample[j].idx] = NAN;
2789 }
2790
2791 // read line in memory
2792 model->n++;
2793 for (int j = 0; j < nsmpl; j++) {
2794 if (model->flags & WGS_DATA) {
2795 if (ad0[j] == bcf_int16_missing && ad1[j] == bcf_int16_missing) continue;
2796
2797 // site too close to last het site or hom site too close to last site
2798 if ((pos < last_het_pos[j] + model->min_dst)
2799 || ((phase_arr[j] == bcf_int8_missing || phase_arr[j] == bcf_int8_vector_end)
2800 && (pos < last_pos[j] + model->min_dst)))
2801 continue;
2802
2803 // substitute the last hom site with the current het site
2804 if (pos < last_pos[j] + model->min_dst) sample[j].n--;
2805
2806 if (phase_arr[j] != bcf_int8_missing || phase_arr[j] == bcf_int8_vector_end) last_het_pos[j] = pos;
2807 last_pos[j] = pos;
2808
2809 sample[j].n++;
2810 hts_expand(int, sample[j].n, sample[j].m_vcf_imap, sample[j].vcf_imap_arr);
2811 hts_expand(int8_t, sample[j].n, sample[j].m_phase, sample[j].phase_arr);
2812 hts_expand(int16_t, sample[j].n, sample[j].m_data[AD0], sample[j].data_arr[AD0]);
2813 hts_expand(int16_t, sample[j].n, sample[j].m_data[AD1], sample[j].data_arr[AD1]);
2814 sample[j].vcf_imap_arr[sample[j].n - 1] = i;
2815 sample[j].phase_arr[sample[j].n - 1] = phase_arr[j];
2816 sample[j].data_arr[AD0][sample[j].n - 1] = ad0[j];
2817 sample[j].data_arr[AD1][sample[j].n - 1] = ad1[j];
2818
2819 } else {
2820 float lrr = ((float *)lrr_fmt->p)[sample[j].idx];
2821 float baf = ((float *)baf_fmt->p)[sample[j].idx];
2822 if (isnan(lrr) && isnan(baf)) continue;
2823
2824 sample[j].n++;
2825 hts_expand(int, sample[j].n, sample[j].m_vcf_imap, sample[j].vcf_imap_arr);
2826 hts_expand(int8_t, sample[j].n, sample[j].m_phase, sample[j].phase_arr);
2827 hts_expand(int16_t, sample[j].n, sample[j].m_data[LRR], sample[j].data_arr[LRR]);
2828 hts_expand(int16_t, sample[j].n, sample[j].m_data[BAF], sample[j].data_arr[BAF]);
2829 sample[j].vcf_imap_arr[sample[j].n - 1] = i;
2830 sample[j].phase_arr[sample[j].n - 1] = phase_arr[j];
2831 sample[j].data_arr[LRR][sample[j].n - 1] = float_to_int16(lrr);
2832 sample[j].data_arr[BAF][sample[j].n - 1] = float_to_int16(baf);
2833 }
2834 }
2835 }
2836 free(gts);
2837 free(phase_arr);
2838 free(imap_arr);
2839 free(gt0);
2840 free(gt1);
2841 free(ad0);
2842 free(ad1);
2843 free(last_het_pos);
2844 free(last_pos);
2845
2846 return i;
2847 }
2848
read_stats(sample_t * samples,const bcf_hdr_t * hdr,const char * fn,int lrr_gc_order,int flags)2849 static int read_stats(sample_t *samples, const bcf_hdr_t *hdr, const char *fn, int lrr_gc_order, int flags) {
2850 htsFile *fp = hts_open(fn, "r");
2851 if (fp == NULL) error("Could not open %s: %s\n", fn, strerror(errno));
2852
2853 kstring_t str = {0, 0, NULL};
2854 if (hts_getline(fp, KS_SEP_LINE, &str) <= 0) error("Empty file: %s\n", fn);
2855 tsv_t *tsv = tsv_init_delimiter(str.s, '\t');
2856 sample_t sample;
2857 memset((void *)&sample, 0, sizeof(sample));
2858 if (tsv_register(tsv, "sample_id", tsv_read_sample_id, (void *)&sample.idx) < 0)
2859 error("File %s is missing the sample_id column\n", fn);
2860 if (tsv_register(tsv, "computed_gender", tsv_read_computed_gender, (void *)&sample.computed_gender) < 0)
2861 error("File %s is missing the computed_gender column\n", fn);
2862 if (tsv_register(tsv, "call_rate", tsv_read_float, (void *)&sample.stats.call_rate) < 0)
2863 error("File %s is missing the call_rate column\n", fn);
2864 int lrr_median = tsv_register(tsv, flags & WGS_DATA ? "cov_median" : "lrr_median", tsv_read_float,
2865 (void *)&sample.stats.lrr_median);
2866 int lrr_sd =
2867 tsv_register(tsv, flags & WGS_DATA ? "cov_sd" : "lrr_sd", tsv_read_float, (void *)&sample.stats.lrr_sd);
2868 if (tsv_register(tsv, flags & WGS_DATA ? "cov_auto" : "lrr_auto", tsv_read_float, (void *)&sample.stats.lrr_auto)
2869 < 0)
2870 sample.stats.lrr_auto = NAN;
2871 int baf_sd =
2872 tsv_register(tsv, flags & WGS_DATA ? "baf_corr" : "baf_sd", tsv_read_float, (void *)&sample.stats.dispersion);
2873 int baf_conc = tsv_register(tsv, "baf_conc", tsv_read_float, (void *)&sample.stats.baf_conc);
2874 if (tsv_register(tsv, "baf_auto", tsv_read_float, (void *)&sample.stats.baf_auto) < 0) sample.stats.baf_auto = NAN;
2875 tsv_register(tsv, "n_sites", tsv_read_integer, (void *)&sample.n_sites);
2876 tsv_register(tsv, "n_hets", tsv_read_integer, (void *)&sample.n_hets);
2877 tsv_register(tsv, "x_nonpar_n_hets", tsv_read_integer, (void *)&sample.x_nonpar_n_hets);
2878 tsv_register(tsv, "par1_n_hets", tsv_read_integer, (void *)&sample.par1_n_hets);
2879 tsv_register(tsv, "xtr_n_hets", tsv_read_integer, (void *)&sample.xtr_n_hets);
2880 tsv_register(tsv, "par2_n_hets", tsv_read_integer, (void *)&sample.par2_n_hets);
2881 if (tsv_register(tsv, "x_nonpar_baf_corr", tsv_read_float, (void *)&sample.x_nonpar_dispersion) < 0)
2882 sample.x_nonpar_dispersion = NAN;
2883 if (tsv_register(tsv, flags & WGS_DATA ? "x_nonpar_cov_median" : "x_nonpar_lrr_median", tsv_read_float,
2884 (void *)&sample.x_nonpar_lrr_median)
2885 < 0)
2886 sample.x_nonpar_lrr_median = NAN;
2887 if (tsv_register(tsv, flags & WGS_DATA ? "y_nonpar_cov_median" : "y_nonpar_lrr_median", tsv_read_float,
2888 (void *)&sample.y_nonpar_lrr_median)
2889 < 0)
2890 sample.y_nonpar_lrr_median = NAN;
2891 if (tsv_register(tsv, flags & WGS_DATA ? "mt_cov_median" : "mt_lrr_median", tsv_read_float,
2892 (void *)&sample.mt_lrr_median)
2893 < 0)
2894 sample.mt_lrr_median = NAN;
2895 int lrr_gc_rel_ess = tsv_register(tsv, "lrr_gc_rel_ess", tsv_read_float, (void *)&sample.stats.lrr_gc_rel_ess);
2896 if (lrr_sd < 0 || lrr_gc_rel_ess < 0) sample.adjlrr_sd = NAN;
2897 int lrr_gc_coeffs = 0;
2898 for (int i = 0; i <= lrr_gc_order; i++) {
2899 str.l = 0;
2900 ksprintf(&str, "lrr_gc_%d", i);
2901 if (!(tsv_register(tsv, str.s, tsv_read_float, (void *)&sample.stats.coeffs[i]) < 0)) lrr_gc_coeffs++;
2902 }
2903 int i = 0;
2904 while (hts_getline(fp, KS_SEP_LINE, &str) > 0) {
2905 if (!tsv_parse_delimiter(tsv, (bcf1_t *)hdr, str.s, '\t')) {
2906 int idx = sample.idx;
2907 if (idx < 0) continue;
2908 if (flags & WGS_DATA) {
2909 sample.stats.lrr_sd = sample.stats.lrr_sd / sample.stats.lrr_median;
2910 sample.stats.lrr_median = logf(sample.stats.lrr_median);
2911 sample.x_nonpar_lrr_median = logf(sample.x_nonpar_lrr_median);
2912 sample.y_nonpar_lrr_median = logf(sample.y_nonpar_lrr_median);
2913 sample.mt_lrr_median = logf(sample.mt_lrr_median);
2914 }
2915 if (!(lrr_sd < 0)) {
2916 sample.adjlrr_sd = sample.stats.lrr_sd;
2917 if (!(lrr_gc_rel_ess < 0)) sample.adjlrr_sd *= sqrtf(1.0f - sample.stats.lrr_gc_rel_ess);
2918 }
2919 memcpy((void *)&samples[idx], (const void *)&sample, sizeof(sample_t));
2920 i++;
2921 } else {
2922 error("Could not parse line: %s\n", str.s);
2923 }
2924 }
2925 tsv_destroy(tsv);
2926 free(str.s);
2927 hts_close(fp);
2928 if (i < bcf_hdr_nsamples(hdr)) error("File %s is missing samples\n", fn);
2929 int ret = (lrr_median < 0) || (lrr_sd < 0) || (baf_sd < 0) || (baf_conc < 0) || (lrr_gc_rel_ess < 0)
2930 || (lrr_gc_coeffs != lrr_gc_order + 1);
2931 return ret;
2932 }
2933
2934 /*********************************
2935 * PLUGIN CODE *
2936 *********************************/
2937
about(void)2938 const char *about(void) { return "Runs SNP-based method for detection of MOsaic CHromosomal Alterations (MoChA).\n"; }
2939
usage_text(void)2940 static const char *usage_text(void) {
2941 return "\n"
2942 "About: MOsaic CHromosomal Alterations caller, requires phased genotypes (GT)\n"
2943 " and either B-allele frequency (BAF) and Log R Ratio intensity (LRR)\n"
2944 " or allelic depth coverage (AD). (version " MOCHA_VERSION
2945 " https://github.com/freeseek/mocha)\n"
2946 "Usage: bcftools +mocha [OPTIONS] <in.vcf.gz>\n"
2947 "\n"
2948 "Required options:\n"
2949 " -g, --genome <assembly>[?] predefined genome reference rules, 'list' to print available settings, "
2950 "append '?' for details\n"
2951 " -G, --genome-file <file> genome reference rules, space/tab-delimited CHROM:FROM-TO,TYPE\n"
2952 "\n"
2953 "General Options:\n"
2954 " -v, --variants [^]<file> tabix-indexed [compressed] VCF/BCF file containing variants\n"
2955 " -f, --apply-filters <list> require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n"
2956 " to include (or exclude with \"^\" prefix) in the analysis\n"
2957 " -e, --exclude <expr> exclude sites for which the expression is true\n"
2958 " -i, --include <expr> select sites for which the expression is true\n"
2959 " -r, --regions <region> restrict to comma-separated list of regions\n"
2960 " -R, --regions-file <file> restrict to regions listed in a file\n"
2961 " -t, --targets [^]<region> restrict to comma-separated list of regions. Exclude regions with \"^\" "
2962 "prefix\n"
2963 " -T, --targets-file [^]<file> restrict to regions listed in a file. Exclude regions with \"^\" "
2964 "prefix\n"
2965 " -s, --samples [^]<list> comma separated list of samples to include (or exclude with \"^\" "
2966 "prefix)\n"
2967 " -S, --samples-file [^]<file> file of samples to include (or exclude with \"^\" prefix)\n"
2968 " --force-samples only warn about unknown subset samples\n"
2969 " --input-stats <file> input samples genome-wide statistics file\n"
2970 " --only-stats compute genome-wide statistics without detecting mosaic chromosomal "
2971 "alterations\n"
2972 " -p --cnp <file> list of regions to genotype in BED format\n"
2973 " --mhc <region> MHC region to exclude from analysis (will be retained in the output)\n"
2974 " --kir <region> KIR region to exclude from analysis (will be retained in the output)\n"
2975 " --threads <int> number of extra output compression threads [0]\n"
2976 "\n"
2977 "Output Options:\n"
2978 " -o, --output <file> write output to a file [no output]\n"
2979 " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: "
2980 "uncompressed VCF [v]\n"
2981 " --no-version do not append version and command line to the header\n"
2982 " -a --no-annotations omit Ldev and Bdev FORMAT from output VCF (requires --output)\n"
2983 " --no-log suppress progress report on standard error\n"
2984 " -l --log <file> write log to file [standard error]\n"
2985 " -c, --calls <file> write chromosomal alterations calls table to a file [standard output]\n"
2986 " -z --stats <file> write samples genome-wide statistics table to a file [no output]\n"
2987 " -u, --ucsc-bed <file> write UCSC bed track to a file [no output]\n"
2988 "\n"
2989 "HMM Options:\n"
2990 " --bdev-LRR-BAF <list> comma separated list of inverse BAF deviations for LRR+BAF model "
2991 "[" BDEV_LRR_BAF_DFLT
2992 "]\n"
2993 " --bdev-BAF-phase <list> comma separated list of inverse BAF deviations for BAF+phase model\n"
2994 " [" BDEV_BAF_PHASE_DFLT
2995 "]\n"
2996 " --min-dist <int> minimum base pair distance between consecutive sites for WGS data "
2997 "[" MIN_DST_DFLT
2998 "]\n"
2999 " --adjust-BAF-LRR <int> minimum number of genotypes for a cluster to median adjust BAF and LRR "
3000 "(-1 for no adjustment) [" ADJ_BAF_LRR_DFLT
3001 "]\n"
3002 " --regress-BAF-LRR <int> minimum number of genotypes for a cluster to regress BAF against LRR "
3003 "(-1 for no regression) [" REGRESS_BAF_LRR_DFLT
3004 "]\n"
3005 " --LRR-GC-order <int> order of polynomial to regress LRR against local GC content (-1 for no "
3006 "regression) [" LRR_GC_ORDER_DFLT
3007 "]\n"
3008 " --xy-major-pl major transition phred-scaled likelihood [" XY_MAJOR_PL_DFLT
3009 "]\n"
3010 " --xy-minor-pl minor transition phred-scaled likelihood [" XY_MINOR_PL_DFLT
3011 "]\n"
3012 " --auto-tel-pl autosomal telomeres phred-scaled likelihood [" AUTO_TEL_PL_DFLT
3013 "]\n"
3014 " --chrX-tel-pl chromosome X telomeres phred-scaled likelihood [" CHRX_TEL_PL_DFLT
3015 "]\n"
3016 " --chrY-tel-pl chromosome Y telomeres phred-scaled likelihood [" CHRY_TEL_PL_DFLT
3017 "]\n"
3018 " --error-pl uniform error phred-scaled likelihood [" ERR_PL_DFLT
3019 "]\n"
3020 " --flip-pl phase flip phred-scaled likelihood [" FLIP_PL_DFLT
3021 "]\n"
3022 " --short-arm-chrs <list> list of chromosomes with short arms [" SHORT_ARM_CHRS_DFLT
3023 "]\n"
3024 " --use-short-arms use variants in short arms [FALSE]\n"
3025 " --use-centromeres use variants in centromeres [FALSE]\n"
3026 " --use-no-rules-chrs use chromosomes without centromere rules [FALSE]\n"
3027 " --LRR-weight <float> relative contribution from LRR for LRR+BAF model [" LRR_BIAS_DFLT
3028 "]\n"
3029 " --LRR-hap2dip <float> difference between LRR for haploid and diploid [" LRR_HAP2DIP_DFLT
3030 "]\n"
3031 " --LRR-cutoff <float> cutoff between LRR for haploid and diploid used to infer gender "
3032 "[estimated from X nonPAR]\n"
3033 "\n"
3034 "Examples:\n"
3035 " bcftools +mocha -g GRCh37 -v ^exclude.bcf -p cnps.bed -c calls.tsv -z stats.tsv input.bcf\n"
3036 " bcftools +mocha -g GRCh38 -o output.bcf -Ob -c calls.tsv -z stats.tsv --LRR-weight 0.5 input.bcf\n"
3037 "\n";
3038 }
3039
read_list_invf(const char * str,int * n,float min,float max)3040 static float *read_list_invf(const char *str, int *n, float min, float max) {
3041 char *tmp, **list = hts_readlist(str, 0, n);
3042 if (*n >= 128) error("Cannot handle list of 128 or more parameters: %s\n", str);
3043 float *ret = (float *)malloc(*n * sizeof(float));
3044 for (int i = 0; i < *n; i++) {
3045 ret[i] = 1.0f / strtof(list[i], &tmp);
3046 if (*tmp) error("Could not parse: %s\n", list[i]);
3047 if (ret[i] < min || ret[i] > max)
3048 error("Expected values from the interval [%f,%f], found 1/%s\n", min, max, list[i]);
3049 free(list[i]);
3050 }
3051 free(list);
3052 ks_introsort_float((size_t)*n, ret);
3053 return ret;
3054 }
3055
cnp_parse(const char * line,char ** chr_beg,char ** chr_end,uint32_t * beg,uint32_t * end,void * payload,void * usr)3056 static int cnp_parse(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload,
3057 void *usr) {
3058 // Use the standard parser for CHROM,FROM,TO
3059 int ret = regidx_parse_bed(line, chr_beg, chr_end, beg, end, NULL, NULL);
3060 if (ret != 0) return ret;
3061
3062 // Skip the fields that were parsed above
3063 char *ss = (char *)line;
3064 while (*ss && isspace(*ss)) ss++;
3065 for (int i = 0; i < 3; i++) {
3066 while (*ss && !isspace(*ss)) ss++;
3067 if (!*ss) return -2; // wrong number of fields
3068 while (*ss && isspace(*ss)) ss++;
3069 }
3070 if (!*ss) return -2;
3071
3072 // Parse the payload
3073 int *dat = (int *)payload;
3074 if (strncmp(ss, "DEL", 3) == 0)
3075 *dat = MOCHA_CNP_LOSS;
3076 else if (strncmp(ss, "DUP", 3) == 0)
3077 *dat = MOCHA_CNP_GAIN;
3078 else if (strncmp(ss, "CNV", 3) == 0)
3079 *dat = MOCHA_CNP_CNV;
3080 else
3081 *dat = MOCHA_UNDET;
3082 return 0;
3083 }
3084
get_file_handle(const char * str)3085 static FILE *get_file_handle(const char *str) {
3086 FILE *ret;
3087 if (strcmp(str, "-") == 0)
3088 ret = stdout;
3089 else {
3090 ret = fopen(str, "w");
3091 if (!ret) error("Failed to open %s: %s\n", str, strerror(errno));
3092 }
3093 return ret;
3094 }
3095
run(int argc,char * argv[])3096 int run(int argc, char *argv[]) {
3097 beta_binom_null = beta_binom_init();
3098 beta_binom_alt = beta_binom_init();
3099
3100 // program options
3101 char *tmp = NULL;
3102 int rules_is_file = 0;
3103 int only_stats = 0;
3104 int regions_is_file = 0;
3105 int targets_is_file = 0;
3106 int sample_is_file = 0;
3107 int force_samples = 0;
3108 int output_type = FT_VCF;
3109 int n_threads = 0;
3110 int record_cmd_line = 1;
3111 const char *filter_fname = NULL;
3112 const char *filter_str = NULL;
3113 const char *regions_list = NULL;
3114 const char *targets_list = NULL;
3115 const char *sample_names = NULL;
3116 const char *output_fname = NULL;
3117 const char *cnp_fname = NULL;
3118 const char *stats_fname = NULL;
3119 const char *mhc_reg = NULL;
3120 const char *kir_reg = NULL;
3121 char *rules = NULL;
3122 sample_t *sample = NULL;
3123 FILE *log_file = stderr;
3124 FILE *out_fc = stdout;
3125 FILE *out_fz = NULL;
3126 FILE *out_fu = NULL;
3127 bcf_hdr_t *hdr = NULL;
3128 bcf_hdr_t *out_hdr = NULL;
3129 htsFile *out_fh = NULL;
3130 mocha_table_t mocha_table = {0, 0, NULL};
3131 const char *short_arm_chrs = SHORT_ARM_CHRS_DFLT;
3132 const char *bdev_lrr_baf = BDEV_LRR_BAF_DFLT;
3133 const char *bdev_baf_phase = BDEV_BAF_PHASE_DFLT;
3134
3135 // model parameters
3136 model_t model;
3137 memset(&model, 0, sizeof(model_t));
3138 model.xy_major_log_prb = -strtof(XY_MAJOR_PL_DFLT, NULL) / 10.0f * M_LN10;
3139 model.xy_minor_log_prb = -strtof(XY_MINOR_PL_DFLT, NULL) / 10.0f * M_LN10;
3140 model.auto_tel_log_prb = -strtof(AUTO_TEL_PL_DFLT, NULL) / 10.0f * M_LN10;
3141 model.chrX_tel_log_prb = -strtof(CHRX_TEL_PL_DFLT, NULL) / 10.0f * M_LN10;
3142 model.chrY_tel_log_prb = -strtof(CHRY_TEL_PL_DFLT, NULL) / 10.0f * M_LN10;
3143 model.err_log_prb = -strtof(ERR_PL_DFLT, NULL) / 10.0f * M_LN10;
3144 model.flip_log_prb = -strtof(FLIP_PL_DFLT, NULL) / 10.0f * M_LN10;
3145 model.min_dst = (int)strtol(MIN_DST_DFLT, NULL, 0);
3146 model.lrr_bias = strtof(LRR_BIAS_DFLT, NULL);
3147 model.lrr_hap2dip = strtof(LRR_HAP2DIP_DFLT, NULL);
3148 model.lrr_cutoff = NAN;
3149 model.adj_baf_lrr = (int)strtol(ADJ_BAF_LRR_DFLT, NULL, 0);
3150 model.regress_baf_lrr = (int)strtol(REGRESS_BAF_LRR_DFLT, NULL, 0);
3151 model.lrr_gc_order = (int)strtol(LRR_GC_ORDER_DFLT, NULL, 0);
3152
3153 // create synced reader object
3154 bcf_srs_t *sr = bcf_sr_init();
3155 bcf_sr_set_opt(sr, BCF_SR_REQUIRE_IDX);
3156
3157 static struct option loptions[] = {{"genome", required_argument, NULL, 'g'},
3158 {"genome-file", required_argument, NULL, 'G'},
3159 {"variants", required_argument, NULL, 'v'},
3160 {"apply-filters", required_argument, NULL, 'f'},
3161 {"exclude", required_argument, NULL, 'e'},
3162 {"include", required_argument, NULL, 'i'},
3163 {"regions", required_argument, NULL, 'r'},
3164 {"regions-file", required_argument, NULL, 'R'},
3165 {"targets", required_argument, NULL, 't'},
3166 {"targets-file", required_argument, NULL, 'T'},
3167 {"samples", required_argument, NULL, 's'},
3168 {"samples-file", required_argument, NULL, 'S'},
3169 {"force-samples", no_argument, NULL, 1},
3170 {"cnp", required_argument, NULL, 'p'},
3171 {"input-stats", required_argument, NULL, 2},
3172 {"only-stats", no_argument, NULL, 3},
3173 {"mhc", required_argument, NULL, 4},
3174 {"kir", required_argument, NULL, 5},
3175 {"threads", required_argument, NULL, 9},
3176 {"output", required_argument, NULL, 'o'},
3177 {"output-type", required_argument, NULL, 'O'},
3178 {"no-version", no_argument, NULL, 8},
3179 {"no-annotations", no_argument, NULL, 'a'},
3180 {"calls", required_argument, NULL, 'c'},
3181 {"stats", required_argument, NULL, 'z'},
3182 {"ucsc-bed", required_argument, NULL, 'u'},
3183 {"log", required_argument, NULL, 'l'},
3184 {"no-log", no_argument, NULL, 10},
3185 {"bdev-LRR-BAF", required_argument, NULL, 11},
3186 {"bdev-BAF-phase", required_argument, NULL, 12},
3187 {"min-dist", required_argument, NULL, 13},
3188 {"adjust-BAF-LRR", required_argument, NULL, 14},
3189 {"regress-BAF-LRR", required_argument, NULL, 15},
3190 {"LRR-GC-order", required_argument, NULL, 16},
3191 {"xy-major-pl", required_argument, NULL, 17},
3192 {"xy-minor-pl", required_argument, NULL, 18},
3193 {"auto-tel-pl", required_argument, NULL, 19},
3194 {"chrX-tel-pl", required_argument, NULL, 20},
3195 {"chrY-tel-pl", required_argument, NULL, 21},
3196 {"error-pl", required_argument, NULL, 22},
3197 {"flip-pl", required_argument, NULL, 23},
3198 {"short-arm-chrs", required_argument, NULL, 24},
3199 {"use-short-arms", no_argument, NULL, 25},
3200 {"use-centromeres", no_argument, NULL, 26},
3201 {"use-no-rules-chrs", no_argument, NULL, 27},
3202 {"LRR-weight", required_argument, NULL, 28},
3203 {"LRR-hap2dip", required_argument, NULL, 29},
3204 {"LRR-cutoff", required_argument, NULL, 30},
3205 {NULL, 0, NULL, 0}};
3206 int c;
3207 while ((c = getopt_long(argc, argv, "h?g:G:v:f:e:i:r:R:t:T:s:S:p:o:O:al:c:z:u:", loptions, NULL)) >= 0) {
3208 switch (c) {
3209 case 'g':
3210 rules = optarg;
3211 break;
3212 case 'G':
3213 rules = optarg;
3214 rules_is_file = 1;
3215 break;
3216 case 'v':
3217 if (optarg[0] == '^') {
3218 filter_fname = optarg + 1;
3219 model.flags |= FLT_EXCLUDE;
3220 } else {
3221 filter_fname = optarg;
3222 model.flags |= FLT_INCLUDE;
3223 }
3224 break;
3225 case 'f':
3226 sr->apply_filters = optarg;
3227 break;
3228 case 'e':
3229 filter_str = optarg;
3230 model.filter_logic |= FLT_EXCLUDE;
3231 break;
3232 case 'i':
3233 filter_str = optarg;
3234 model.filter_logic |= FLT_INCLUDE;
3235 break;
3236 case 'r':
3237 regions_list = optarg;
3238 break;
3239 case 'R':
3240 regions_list = optarg;
3241 regions_is_file = 1;
3242 break;
3243 case 't':
3244 targets_list = optarg;
3245 break;
3246 case 'T':
3247 targets_list = optarg;
3248 targets_is_file = 1;
3249 break;
3250 case 's':
3251 sample_names = optarg;
3252 break;
3253 case 'S':
3254 sample_names = optarg;
3255 sample_is_file = 1;
3256 break;
3257 case 1:
3258 force_samples = 1;
3259 break;
3260 case 2:
3261 stats_fname = optarg;
3262 break;
3263 case 3:
3264 only_stats = 1;
3265 break;
3266 case 'p':
3267 cnp_fname = optarg;
3268 break;
3269 case 4:
3270 mhc_reg = optarg;
3271 break;
3272 case 5:
3273 kir_reg = optarg;
3274 break;
3275 case 9:
3276 n_threads = (int)strtol(optarg, &tmp, 0);
3277 if (*tmp) error("Could not parse: --threads %s\n", optarg);
3278 break;
3279 case 'o':
3280 output_fname = optarg;
3281 break;
3282 case 'O':
3283 switch (optarg[0]) {
3284 case 'b':
3285 output_type = FT_BCF_GZ;
3286 break;
3287 case 'u':
3288 output_type = FT_BCF;
3289 break;
3290 case 'z':
3291 output_type = FT_VCF_GZ;
3292 break;
3293 case 'v':
3294 output_type = FT_VCF;
3295 break;
3296 default:
3297 error("The output type \"%s\" not recognised\n", optarg);
3298 };
3299 break;
3300 case 8:
3301 record_cmd_line = 0;
3302 break;
3303 case 'a':
3304 model.flags |= NO_ANNOT;
3305 break;
3306 case 'l':
3307 log_file = get_file_handle(optarg);
3308 break;
3309 case 10:
3310 model.flags |= NO_LOG;
3311 break;
3312 case 'c':
3313 out_fc = get_file_handle(optarg);
3314 break;
3315 case 'z':
3316 out_fz = get_file_handle(optarg);
3317 break;
3318 case 'u':
3319 out_fu = get_file_handle(optarg);
3320 break;
3321 case 11:
3322 bdev_lrr_baf = optarg;
3323 break;
3324 case 12:
3325 bdev_baf_phase = optarg;
3326 break;
3327 case 13:
3328 model.min_dst = (int)strtol(optarg, &tmp, 0);
3329 if (*tmp) error("Could not parse: --min-dist %s\n", optarg);
3330 break;
3331 case 14:
3332 model.adj_baf_lrr = (int)strtol(optarg, &tmp, 0);
3333 if (*tmp) error("Could not parse: --adjust-BAF-LRR %s\n", optarg);
3334 break;
3335 case 15:
3336 model.regress_baf_lrr = (int)strtol(optarg, &tmp, 0);
3337 if (*tmp) error("Could not parse: --regress-BAF-LRR %s\n", optarg);
3338 break;
3339 case 16:
3340 model.lrr_gc_order = (int)strtol(optarg, &tmp, 0);
3341 if (*tmp) error("Could not parse: --LRR-GC-order %s\n", optarg);
3342 break;
3343 case 17:
3344 model.xy_major_log_prb = -strtof(optarg, &tmp) / 10.0f * M_LN10;
3345 if (*tmp) error("Could not parse: --xy-major-pl %s\n", optarg);
3346 break;
3347 case 18:
3348 model.xy_minor_log_prb = -strtof(optarg, &tmp) / 10.0f * M_LN10;
3349 if (*tmp) error("Could not parse: --xy-minor-pl %s\n", optarg);
3350 break;
3351 case 19:
3352 model.auto_tel_log_prb = -strtof(optarg, &tmp) / 10.0f * M_LN10;
3353 if (*tmp) error("Could not parse: --auto-tel-pl %s\n", optarg);
3354 break;
3355 case 20:
3356 model.chrX_tel_log_prb = -strtof(optarg, &tmp) / 10.0f * M_LN10;
3357 if (*tmp) error("Could not parse: --chrX-tel-pl %s\n", optarg);
3358 break;
3359 case 21:
3360 model.chrY_tel_log_prb = -strtof(optarg, &tmp) / 10.0f * M_LN10;
3361 if (*tmp) error("Could not parse: --chrY-tel-pl %s\n", optarg);
3362 break;
3363 case 22:
3364 model.err_log_prb = -strtof(optarg, &tmp) / 10.0f * M_LN10;
3365 if (*tmp) error("Could not parse: --error-pl %s\n", optarg);
3366 break;
3367 case 23:
3368 model.flip_log_prb = -strtof(optarg, &tmp) / 10.0f * M_LN10;
3369 if (*tmp) error("Could not parse: --flip-pl %s\n", optarg);
3370 break;
3371 case 24:
3372 short_arm_chrs = optarg;
3373 break;
3374 case 25:
3375 model.flags |= USE_SHORT_ARMS;
3376 break;
3377 case 26:
3378 model.flags |= USE_CENTROMERES;
3379 break;
3380 case 27:
3381 model.flags |= USE_NO_RULES_CHRS;
3382 break;
3383 case 28:
3384 model.lrr_bias = strtof(optarg, &tmp);
3385 if (*tmp) error("Could not parse: --LRR-weight %s\n", optarg);
3386 break;
3387 case 29:
3388 model.lrr_hap2dip = strtof(optarg, &tmp);
3389 if (*tmp) error("Could not parse: --LRR-hap2dip %s\n", optarg);
3390 break;
3391 case 30:
3392 model.lrr_cutoff = strtof(optarg, &tmp);
3393 if (*tmp) error("Could not parse: --LRR-cutoff %s\n", optarg);
3394 break;
3395 case 'h':
3396 case '?':
3397 error("%s", usage_text());
3398 break;
3399 default:
3400 error("Unknown argument: %s\n", optarg);
3401 }
3402 }
3403
3404 if (!rules) {
3405 fprintf(log_file, "Genome reference assembly was not specified with --genome or --genome-file\n");
3406 error("%s", usage_text());
3407 }
3408 int len = strlen(rules);
3409 if (!rules_is_file && (strncmp(rules, "list", 4) == 0 || rules[len - 1] == '?'))
3410 genome_init_alias(log_file, rules, NULL);
3411
3412 if (model.filter_logic == (FLT_EXCLUDE | FLT_INCLUDE)) error("Only one of --include or --exclude can be given.\n");
3413
3414 if (only_stats) {
3415 if (!out_fz) {
3416 fprintf(log_file, "Option --only-stats requires output option --stats\n");
3417 error("%s", usage_text());
3418 }
3419 if (output_fname || out_fc != stdout || out_fu) {
3420 fprintf(log_file, "Cannot use --only-stats with output options --output, --calls, or --ucsc-bed\n");
3421 error("%s", usage_text());
3422 }
3423 }
3424
3425 if (output_fname == NULL && (model.flags & NO_ANNOT)) {
3426 fprintf(log_file, "Option --no-annotations requires option --output\n");
3427 error("%s", usage_text());
3428 }
3429
3430 if (model.lrr_gc_order > MAX_ORDER) {
3431 fprintf(log_file, "Polynomial order must not be greater than %d: --LRR-GC-order %d\n", MAX_ORDER,
3432 model.lrr_gc_order);
3433 error("%s", usage_text());
3434 }
3435
3436 if (model.xy_major_log_prb > model.xy_minor_log_prb) {
3437 fprintf(log_file,
3438 "Major transition phred-scaled likelihood should be greater than minor transition phred-scaled "
3439 "likelihood: --xy-major-pl %.2f --xy-minor-pl %.2f\n",
3440 -10.0f * M_LOG10E * model.xy_major_log_prb, -10.0f * M_LOG10E * model.xy_minor_log_prb);
3441 error("%s", usage_text());
3442 }
3443
3444 if (model.xy_minor_log_prb > model.auto_tel_log_prb) {
3445 fprintf(log_file,
3446 "Minor transition phred-scaled likelihood should be greater than autosomal telomeres phred-scaled "
3447 "likelihood: --xy-minor-pl %.2f --auto-tel-pl %.2f\n",
3448 -10.0f * M_LOG10E * model.xy_minor_log_prb, -10.0f * M_LOG10E * model.auto_tel_log_prb);
3449 error("%s", usage_text());
3450 }
3451
3452 if (model.xy_minor_log_prb > model.chrX_tel_log_prb) {
3453 fprintf(log_file,
3454 "Minor transition phred-scaled likelihood should be greater than chromosome X telomeres phred-scaled "
3455 "likelihood: --xy-minor-pl %.2f --chrX-tel-pl %.2f\n",
3456 -10.0f * M_LOG10E * model.xy_minor_log_prb, -10.0f * M_LOG10E * model.chrX_tel_log_prb);
3457 error("%s", usage_text());
3458 }
3459
3460 if (model.xy_minor_log_prb > model.chrY_tel_log_prb) {
3461 fprintf(log_file,
3462 "Minor transition phred-scaled likelihood should be greater than chromosome Y telomeres phred-scaled "
3463 "likelihood: --xy-minor-pl %.2f --chrY-tel-pl %.2f\n",
3464 -10.0f * M_LOG10E * model.xy_minor_log_prb, -10.0f * M_LOG10E * model.chrY_tel_log_prb);
3465 error("%s", usage_text());
3466 }
3467
3468 if (stats_fname && !isnan(model.lrr_cutoff)) {
3469 fprintf(log_file, "Cannot use options --input-stats and --LRR-cutoff together\n");
3470 error("%s", usage_text());
3471 }
3472
3473 // parse parameters defining hidden states
3474 model.bdev_lrr_baf = read_list_invf(bdev_lrr_baf, &model.bdev_lrr_baf_n, -0.5f, 0.25f);
3475 model.bdev_baf_phase = read_list_invf(bdev_baf_phase, &model.bdev_baf_phase_n, 0.0f, 0.5f);
3476
3477 // read list of regions to genotype
3478 if (cnp_fname) {
3479 model.cnp_idx = regidx_init(cnp_fname, cnp_parse, NULL, sizeof(int), NULL);
3480 if (!model.cnp_idx) error("Error: failed to initialize CNP regions: --cnp %s\n", cnp_fname);
3481 model.cnp_itr = regitr_init(model.cnp_idx);
3482 }
3483 if (mhc_reg) model.mhc_idx = regidx_init_string(mhc_reg, regidx_parse_reg, NULL, 0, NULL);
3484 if (kir_reg) model.kir_idx = regidx_init_string(kir_reg, regidx_parse_reg, NULL, 0, NULL);
3485
3486 // input VCF
3487 char *input_fname = NULL;
3488 if (optind >= argc) {
3489 if (!isatty(fileno((FILE *)stdin))) input_fname = "-";
3490 } else
3491 input_fname = argv[optind];
3492 if (!input_fname) error("%s", usage_text());
3493
3494 // read in the regions from the command line
3495 if (regions_list) {
3496 if (bcf_sr_set_regions(sr, regions_list, regions_is_file) < 0)
3497 error("Failed to read the regions: %s\n", regions_list);
3498 }
3499 if (targets_list) {
3500 if (bcf_sr_set_targets(sr, targets_list, targets_is_file, 0) < 0)
3501 error("Failed to read the targets: %s\n", targets_list);
3502 }
3503
3504 if (bcf_sr_set_threads(sr, n_threads) < 0) error("Failed to create threads\n");
3505 if (!bcf_sr_add_reader(sr, input_fname)) error("Failed to open %s: %s\n", input_fname, bcf_sr_strerror(sr->errnum));
3506 if (filter_fname)
3507 if (!bcf_sr_add_reader(sr, filter_fname))
3508 error("Failed to open %s: %s\n", filter_fname, bcf_sr_strerror(sr->errnum));
3509
3510 // check whether the necessary information has been included in the VCF
3511 hdr = bcf_sr_get_header(sr, 0);
3512 if (bcf_hdr_nsamples(hdr) == 0) error("Error: input VCF file has no samples\n");
3513 if (filter_str) model.filter = filter_init(hdr, filter_str);
3514
3515 model.allele_a_id = bcf_hdr_id2int(hdr, BCF_DT_ID, "ALLELE_A");
3516 if (!bcf_hdr_idinfo_exists(hdr, BCF_HL_INFO, model.allele_a_id)) model.allele_a_id = -1;
3517 if (model.allele_a_id >= 0 && bcf_hdr_id2type(hdr, BCF_HL_INFO, model.allele_a_id) != BCF_HT_INT)
3518 error("Error: input VCF file ALLELE_A info field is not of integer type\n");
3519
3520 model.allele_b_id = bcf_hdr_id2int(hdr, BCF_DT_ID, "ALLELE_B");
3521 if (!bcf_hdr_idinfo_exists(hdr, BCF_HL_INFO, model.allele_a_id)) model.allele_b_id = -1;
3522 if (model.allele_b_id >= 0 && bcf_hdr_id2type(hdr, BCF_HL_INFO, model.allele_b_id) != BCF_HT_INT)
3523 error("Error: input VCF file ALLELE_B info field is not of float type\n");
3524
3525 model.gc_id = bcf_hdr_id2int(hdr, BCF_DT_ID, "GC");
3526 if (!bcf_hdr_idinfo_exists(hdr, BCF_HL_INFO, model.gc_id)) model.gc_id = -1;
3527 if (model.lrr_gc_order > 0 && model.gc_id < 0)
3528 error(
3529 "Error: input VCF has no GC info field: use \"--LRR-GC-order 0/-1\" to disable LRR adjustment through GC "
3530 "correction\nor use bcftools +mochatools -- -t GC to infer GC content\n");
3531
3532 model.gt_id = bcf_hdr_id2int(hdr, BCF_DT_ID, "GT");
3533 if (!bcf_hdr_idinfo_exists(hdr, BCF_HL_FMT, model.gt_id)) error("Error: input VCF file has no GT format field\n");
3534
3535 model.ad_id = bcf_hdr_id2int(hdr, BCF_DT_ID, "AD");
3536 if (!bcf_hdr_idinfo_exists(hdr, BCF_HL_FMT, model.ad_id)) model.ad_id = -1;
3537 if (model.ad_id >= 0 && bcf_hdr_id2type(hdr, BCF_HL_FMT, model.ad_id) != BCF_HT_INT)
3538 error("Error: input VCF file AD format field is not of integer type\n");
3539 if (model.ad_id >= 0 && bcf_hdr_id2length(hdr, BCF_HL_FMT, model.ad_id) != BCF_VL_R)
3540 error("Error: input VCF file AD format field has wrong number of values\n");
3541
3542 model.baf_id = bcf_hdr_id2int(hdr, BCF_DT_ID, "BAF");
3543 if (!bcf_hdr_idinfo_exists(hdr, BCF_HL_FMT, model.baf_id)) model.baf_id = -1;
3544 if (model.baf_id >= 0 && bcf_hdr_id2type(hdr, BCF_HL_FMT, model.baf_id) != BCF_HT_REAL)
3545 error("Error: input VCF file BAF format field is not of float type\n");
3546
3547 model.lrr_id = bcf_hdr_id2int(hdr, BCF_DT_ID, "LRR");
3548 if (!bcf_hdr_idinfo_exists(hdr, BCF_HL_FMT, model.lrr_id)) model.lrr_id = -1;
3549 if (model.lrr_id >= 0 && bcf_hdr_id2type(hdr, BCF_HL_FMT, model.lrr_id) != BCF_HT_REAL)
3550 error("Error: input VCF file LRR format field is not of float type\n");
3551
3552 if (model.ad_id >= 0) {
3553 model.flags |= WGS_DATA;
3554 model.lrr_hap2dip = (float)M_LN2;
3555 } else if (model.baf_id >= 0 && model.lrr_id >= 0) {
3556 if (model.allele_a_id < 0 || model.allele_b_id < 0)
3557 error(
3558 "Error: input VCF file has no ALLELE_A or ALLELE_B info fields\nUse bcftools +mochatools -- -t "
3559 "ALLELE_A,ALLELE_B to infer ALLELE_A and ALLELE_B\n");
3560 } else {
3561 error("Error: input VCF file must contain either the AD format field or the BAF and LRR format fields\n");
3562 }
3563
3564 // median adjustment is necessary with sequencing counts data
3565 if ((model.lrr_gc_order < 0) && (model.flags & WGS_DATA)) model.lrr_gc_order = 0;
3566
3567 // beginning of plugin run
3568 fprintf(log_file, "MoChA " MOCHA_VERSION " https://github.com/freeseek/mocha\n");
3569 fprintf(log_file, "Genome reference: %s\n", rules);
3570 if (sample_names) fprintf(log_file, "Samples: %s\n", sample_names);
3571 if (targets_list) fprintf(log_file, "Targets: %s\n", targets_list);
3572 if (sr->apply_filters) fprintf(log_file, "Filters: %s\n", sr->apply_filters);
3573 if (model.flags & FLT_EXCLUDE) fprintf(log_file, "Variants to exclude: %s\n", filter_fname);
3574 if (model.flags & FLT_INCLUDE) fprintf(log_file, "Variants to include: %s\n", filter_fname);
3575 if (cnp_fname) fprintf(log_file, "Regions to genotype: %s\n", cnp_fname);
3576 if (stats_fname) fprintf(log_file, "Genome-wide statistics: %s\n", stats_fname);
3577 if (mhc_reg) fprintf(log_file, "MHC region to exclude from analysis: %s\n", mhc_reg);
3578 if (kir_reg) fprintf(log_file, "KIR region to exclude from analysis: %s\n", kir_reg);
3579 fprintf(log_file, "BAF deviations for LRR+BAF model: %s\n", bdev_lrr_baf);
3580 fprintf(log_file, "BAF deviations for BAF+phase model: %s\n", bdev_baf_phase);
3581 if (model.flags & WGS_DATA) {
3582 fprintf(log_file, "Minimum base pair distance between consecutive sites: %d\n", model.min_dst);
3583 } else {
3584 fprintf(log_file, "Minimum number of genotypes for a cluster to median adjust BAF and LRR: %d\n",
3585 model.adj_baf_lrr);
3586 fprintf(log_file, "Minimum number of genotypes for a cluster to regress BAF against LRR: %d\n",
3587 model.regress_baf_lrr);
3588 }
3589 fprintf(log_file, "Order of polynomial in local GC content to be used to regress LRR against GC: %d\n",
3590 model.lrr_gc_order);
3591 fprintf(log_file, "Major transition phred-scaled likelihood: %.2f\n", -10.0f * M_LOG10E * model.xy_major_log_prb);
3592 fprintf(log_file, "Minor transition phred-scaled likelihood: %.2f\n", -10.0f * M_LOG10E * model.xy_minor_log_prb);
3593 fprintf(log_file, "Autosomal telomeres phred-scaled likelihood: %.2f\n",
3594 -10.0f * M_LOG10E * model.auto_tel_log_prb);
3595 fprintf(log_file, "Chromosome X telomeres phred-scaled likelihood: %.2f\n",
3596 -10.0f * M_LOG10E * model.chrX_tel_log_prb);
3597 fprintf(log_file, "Chromosome Y telomeres phred-scaled likelihood: %.2f\n",
3598 -10.0f * M_LOG10E * model.chrY_tel_log_prb);
3599 fprintf(log_file, "Uniform error phred-scaled likelihood: %.2f\n", -10.0f * M_LOG10E * model.err_log_prb);
3600 fprintf(log_file, "Phase flip phred-scaled likelihood: %.2f\n", -10.0f * M_LOG10E * model.flip_log_prb);
3601 fprintf(log_file, "List of short arms: %s\n", short_arm_chrs);
3602 fprintf(log_file, "Use variants in short arms: %s\n", model.flags & USE_SHORT_ARMS ? "TRUE" : "FALSE");
3603 fprintf(log_file, "Use variants in centromeres: %s\n", model.flags & USE_CENTROMERES ? "TRUE" : "FALSE");
3604 fprintf(log_file, "Use chromosomes without centromere rules: %s\n",
3605 model.flags & USE_NO_RULES_CHRS ? "TRUE" : "FALSE");
3606 fprintf(log_file, "Relative contribution from LRR for LRR+BAF model: %.2f\n", model.lrr_bias);
3607 fprintf(log_file, "Difference between LRR for haploid and diploid: %.2f\n", model.lrr_hap2dip);
3608
3609 // initialize genome parameters
3610 if (rules_is_file)
3611 model.genome_rules = genome_init_file(rules, hdr);
3612 else
3613 model.genome_rules = genome_init_alias(log_file, rules, hdr);
3614 if (!(model.flags & NO_LOG)) fprintf(log_file, "Using genome assembly from %s\n", rules);
3615 readlist_short_arms(model.genome_rules, short_arm_chrs, hdr);
3616
3617 // subset VCF file
3618 if (sample_names) {
3619 int ret = bcf_hdr_set_samples(hdr, sample_names, sample_is_file);
3620 if (ret < 0)
3621 error("Error parsing the sample list\n");
3622 else if (ret > 0) {
3623 if (force_samples)
3624 fprintf(log_file, "Warn: sample #%d not found in the header... skipping\n", ret);
3625 else
3626 error(
3627 "Error: sample #%d not found in the header. Use \"--force-samples\" to "
3628 "ignore this error\n",
3629 ret);
3630 }
3631 if (bcf_hdr_nsamples(hdr) == 0) error("Error: subsetting has removed all samples\n");
3632 }
3633
3634 // output VCF
3635 if (output_fname) {
3636 out_fh = hts_open(output_fname, hts_bcf_wmode(output_type));
3637 if (out_fh == NULL) error("Cannot write to \"%s\": %s\n", output_fname, strerror(errno));
3638 if (n_threads) hts_set_opt(out_fh, HTS_OPT_THREAD_POOL, sr->p);
3639 out_hdr = print_hdr(out_fh, hdr, argc, argv, record_cmd_line, model.flags);
3640 }
3641
3642 int nsmpl = bcf_hdr_nsamples(hdr);
3643 if (nsmpl == 0) error("No samples in the VCF?\n");
3644 if (!(model.flags & NO_LOG)) fprintf(log_file, "Loading %d sample(s) from the VCF file\n", nsmpl);
3645 if (!(model.flags & WGS_DATA)) {
3646 if (model.regress_baf_lrr != -1 && (model.adj_baf_lrr == -1 || model.regress_baf_lrr < model.adj_baf_lrr))
3647 error(
3648 "Error: median adjustment must be performed whenever regression adjustment is "
3649 "performed\n--adjust-BAF-LRR %d --regress-BAF-LRR %d\n",
3650 model.adj_baf_lrr, model.regress_baf_lrr);
3651 if (nsmpl < 3 * model.adj_baf_lrr)
3652 fprintf(log_file,
3653 "Warning: --adjust-BAF-LRR %d median adjustment will not be effective with "
3654 "only %d sample(s)\n",
3655 model.adj_baf_lrr, nsmpl);
3656 if (nsmpl < 3 * model.regress_baf_lrr)
3657 fprintf(log_file,
3658 "Warning: --regress-BAF-LRR %d regression will not be effective with only "
3659 "%d sample(s)\n",
3660 model.regress_baf_lrr, nsmpl);
3661 }
3662
3663 sample = (sample_t *)calloc(nsmpl, sizeof(sample_t));
3664 for (int i = 0; i < nsmpl; i++) {
3665 sample[i].idx = i;
3666 sample[i].x_nonpar_lrr_median = NAN;
3667 sample[i].y_nonpar_lrr_median = NAN;
3668 sample[i].mt_lrr_median = NAN;
3669 }
3670 if (stats_fname ? read_stats(sample, hdr, stats_fname, model.lrr_gc_order, model.flags) : 1) {
3671 for (int rid = 0; rid < hdr->n[BCF_DT_CTG]; rid++) {
3672 model.rid = rid;
3673 int nret = get_contig(sr, sample, &model);
3674 if (model.n <= 0) continue;
3675 if (!(model.flags & NO_LOG))
3676 fprintf(log_file, "Using %d out of %d read variants from contig %s\n", model.n, nret,
3677 bcf_hdr_id2name(hdr, rid));
3678 if (model.genome_rules->length[rid] < model.locus_arr[model.n - 1].pos)
3679 model.genome_rules->length[rid] = model.locus_arr[model.n - 1].pos;
3680 for (int j = 0; j < nsmpl; j++) sample_stats(sample + j, &model);
3681 }
3682
3683 sample_summary(sample, nsmpl, &model, stats_fname == NULL);
3684 }
3685
3686 int cnt[3] = {0, 0, 0};
3687 for (int i = 0; i < nsmpl; i++) cnt[sample[i].computed_gender]++;
3688 if (!(model.flags & NO_LOG))
3689 fprintf(log_file, "Estimated %d sample(s) of unknown gender, %d male(s) and %d female(s)\n",
3690 cnt[GENDER_UNKNOWN], cnt[GENDER_MALE], cnt[GENDER_FEMALE]);
3691 mocha_print_stats(out_fz, sample, nsmpl, model.lrr_gc_order, hdr, model.flags);
3692
3693 if (!only_stats) {
3694 for (int rid = 0; rid < hdr->n[BCF_DT_CTG]; rid++) {
3695 model.rid = rid;
3696 int nret = get_contig(sr, sample, &model);
3697 if (model.n <= 0) continue;
3698 if (!(model.flags & NO_LOG))
3699 fprintf(log_file, "Using %d out of %d read variants from contig %s\n", model.n, nret,
3700 bcf_hdr_id2name(hdr, rid));
3701 for (int j = 0; j < nsmpl; j++) {
3702 if (model.cnp_idx)
3703 regidx_overlap(model.cnp_idx, bcf_hdr_id2name(hdr, rid), 0, model.genome_rules->length[rid],
3704 model.cnp_itr);
3705 sample_run(sample + j, &mocha_table, &model);
3706 }
3707
3708 if (output_fname) {
3709 nret = put_contig(sr, sample, &model, out_fh, out_hdr);
3710 if (!(model.flags & NO_LOG))
3711 fprintf(log_file, "Written %d variants for contig %s\n", nret, bcf_hdr_id2name(hdr, rid));
3712 }
3713 }
3714
3715 // estimate LRR at common autosomal losses and gains
3716 if (!(model.flags & NO_LOG) && model.cnp_itr) {
3717 int n_cnp_loss = 0, n_cnp_gain = 0, n_rare_gain = 0, n_rare_loss = 0;
3718 float *cnp_ldev = (float *)malloc(mocha_table.n * sizeof(float));
3719 float *rare_ldev = (float *)malloc(mocha_table.n * sizeof(float));
3720 for (int i = 0; i < mocha_table.n; i++) {
3721 if (mocha_table.a[i].rid == model.genome_rules->x_rid) continue;
3722 if (mocha_table.a[i].type == MOCHA_CNP_LOSS && mocha_table.a[i].n_hets == 0)
3723 cnp_ldev[n_cnp_loss++] = mocha_table.a[i].ldev;
3724 else if (mocha_table.a[i].type == MOCHA_CNP_GAIN && mocha_table.a[i].n_hets >= 5
3725 && mocha_table.a[i].bdev >= 0.1f)
3726 cnp_ldev[mocha_table.n - (++n_cnp_gain)] = mocha_table.a[i].ldev;
3727 else if (mocha_table.a[i].type == MOCHA_LOSS && mocha_table.a[i].n_hets == 0)
3728 rare_ldev[n_rare_loss++] = mocha_table.a[i].ldev;
3729 else if (mocha_table.a[i].type == MOCHA_GAIN && mocha_table.a[i].n_hets >= 5
3730 && mocha_table.a[i].bdev >= 0.1f)
3731 rare_ldev[mocha_table.n - (++n_rare_gain)] = mocha_table.a[i].ldev;
3732 }
3733 float cnp_loss_ldev = get_median(cnp_ldev, n_cnp_loss, NULL);
3734 float cnp_gain_ldev = get_median(&cnp_ldev[mocha_table.n - n_cnp_gain], n_cnp_gain, NULL);
3735 float rare_loss_ldev = get_median(rare_ldev, n_rare_loss, NULL);
3736 float rare_gain_ldev = get_median(&rare_ldev[mocha_table.n - n_rare_gain], n_rare_gain, NULL);
3737 fprintf(log_file, "Adjusted LRR at CNP losses: %.4f\n", cnp_loss_ldev);
3738 fprintf(log_file, "Adjusted LRR at CNP gains: %.4f\n", cnp_gain_ldev);
3739 fprintf(log_file, "Adjusted LRR at rare losses: %.4f\n", rare_loss_ldev);
3740 fprintf(log_file, "Adjusted LRR at rare gains: %.4f\n", rare_gain_ldev);
3741 free(cnp_ldev);
3742 free(rare_ldev);
3743 }
3744 }
3745
3746 // clear sample data
3747 for (int j = 0; j < nsmpl; j++) {
3748 free(sample[j].vcf_imap_arr);
3749 free(sample[j].data_arr[BDEV]);
3750 free(sample[j].data_arr[LDEV]);
3751 free(sample[j].phase_arr);
3752 }
3753
3754 // clear model data
3755 free(model.locus_arr);
3756 free(model.gc_arr);
3757 free(model.bdev_lrr_baf);
3758 free(model.bdev_baf_phase);
3759 genome_destroy(model.genome_rules);
3760
3761 // free precomputed tables
3762 ad_to_lrr_baf(NULL, NULL, NULL, NULL, 0);
3763
3764 // write table with mosaic chromosomal alterations (and UCSC bed track)
3765 if (!only_stats) {
3766 mocha_print_calls(out_fc, mocha_table.a, mocha_table.n, hdr, model.flags, rules, model.lrr_hap2dip);
3767 mocha_print_ucsc(out_fu, mocha_table.a, mocha_table.n, hdr);
3768 }
3769 free(mocha_table.a);
3770
3771 // close output VCF
3772 if (output_fname) {
3773 bcf_hdr_destroy(out_hdr);
3774 hts_close(out_fh);
3775 }
3776
3777 if (log_file != stdout && log_file != stderr) fclose(log_file);
3778
3779 // clean up
3780 if (model.filter) filter_destroy(model.filter);
3781 if (model.cnp_idx) regidx_destroy(model.cnp_idx);
3782 if (model.cnp_itr) regitr_destroy(model.cnp_itr);
3783 bcf_sr_destroy(sr);
3784 free(sample);
3785
3786 beta_binom_destroy(beta_binom_null);
3787 beta_binom_destroy(beta_binom_alt);
3788 return 0;
3789 }
3790