1 /* The MIT License
2 
3    Copyright (C) 2015-2021 Giulio Genovese
4 
5    Author: Giulio Genovese <giulio.genovese@gmail.com>
6 
7    Permission is hereby granted, free of charge, to any person obtaining a copy
8    of this software and associated documentation files (the "Software"), to deal
9    in the Software without restriction, including without limitation the rights
10    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11    copies of the Software, and to permit persons to whom the Software is
12    furnished to do so, subject to the following conditions:
13 
14    The above copyright notice and this permission notice shall be included in
15    all copies or substantial portions of the Software.
16 
17    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23    THE SOFTWARE.
24 
25  */
26 
27 #include <stdio.h>
28 #include <unistd.h>
29 #include <getopt.h>
30 #include <errno.h>
31 #include <ctype.h>
32 #include <math.h>
33 #include <htslib/kseq.h>
34 #include <htslib/vcf.h>
35 #include <htslib/synced_bcf_reader.h>
36 #include <htslib/ksort.h>
37 #include "regidx.h"
38 #include "kmin.h"
39 #include "mocha.h"
40 #include "genome_rules.h"
41 #include "beta_binom.h"
42 #include "bcftools.h"
43 #include "filter.h"
44 #include "tsv2vcf.h"
45 
46 #define MOCHA_VERSION "2021-10-15"
47 
48 /****************************************
49  * CONSTANT DEFINITIONS                 *
50  ****************************************/
51 
52 #define SIGN(x) (((x) > 0) - ((x) < 0))
53 
54 #define BDEV_LRR_BAF_DFLT "-2.0,-4.0,-6.0,10.0,6.0,4.0"
55 #define BDEV_BAF_PHASE_DFLT "6.0,8.0,10.0,15.0,20.0,30.0,50.0,80.0,100.0,150.0,200.0"
56 #define MIN_DST_DFLT "400"
57 #define ADJ_BAF_LRR_DFLT "5"
58 #define REGRESS_BAF_LRR_DFLT "15"
59 #define LRR_GC_ORDER_DFLT "2"
60 #define MAX_ORDER 5
61 #define XY_MAJOR_PL_DFLT "65.0"
62 #define XY_MINOR_PL_DFLT "35.0"
63 #define AUTO_TEL_PL_DFLT "20.0"
64 #define CHRX_TEL_PL_DFLT "8.0"
65 #define CHRY_TEL_PL_DFLT "6.0"
66 #define ERR_PL_DFLT "15.0"
67 #define FLIP_PL_DFLT "20.0"
68 #define SHORT_ARM_CHRS_DFLT "13,14,15,21,22,chr13,chr14,chr15,chr21,chr22"
69 #define LRR_BIAS_DFLT "0.2"
70 // https://www.illumina.com/documents/products/technotes/technote_cnv_algorithms.pdf
71 #define LRR_HAP2DIP_DFLT "0.45"
72 
73 #define FLT_INCLUDE (1 << 0)
74 #define FLT_EXCLUDE (1 << 1)
75 #define WGS_DATA (1 << 2)
76 #define NO_LOG (1 << 3)
77 #define NO_ANNOT (1 << 4)
78 #define USE_SHORT_ARMS (1 << 5)
79 #define USE_CENTROMERES (1 << 6)
80 #define USE_NO_RULES_CHRS (1 << 7)
81 
82 #define LRR 0
83 #define BAF 1
84 #define AD0 0
85 #define AD1 1
86 #define LDEV 0
87 #define BDEV 1
88 
89 #define LRR_BAF 0
90 #define BAF_PHASE 1
91 
92 #define MOCHA_UNDET 0
93 #define MOCHA_LOSS 1
94 #define MOCHA_GAIN 2
95 #define MOCHA_CNLOH 3
96 #define MOCHA_CNP_LOSS 4
97 #define MOCHA_CNP_GAIN 5
98 #define MOCHA_CNP_CNV 6
99 
100 #define MOCHA_NOT 0
101 #define MOCHA_ARM 1
102 #define MOCHA_TEL 2
103 
104 #define GT_NC 0
105 #define GT_AA 1
106 #define GT_AB 2
107 #define GT_BB 3
108 
109 /****************************************
110  * DATA STRUCTURES                      *
111  ****************************************/
112 
113 typedef struct {
114     int pos;
115     int allele_a;
116     int allele_b;
117     float adjust[3][3]; // shift
118 } locus_t;
119 
120 typedef struct {
121     int allele_a_id, allele_b_id, gc_id, gt_id, ad_id, baf_id, lrr_id;
122     float xy_major_log_prb;
123     float xy_minor_log_prb;
124     float auto_tel_log_prb;
125     float chrX_tel_log_prb;
126     float chrY_tel_log_prb;
127     float err_log_prb;
128     float flip_log_prb;
129     float *bdev_lrr_baf, *bdev_baf_phase;
130     int bdev_lrr_baf_n, bdev_baf_phase_n;
131     int min_dst;
132     float lrr_cutoff;
133     float lrr_hap2dip;
134     float lrr_bias;
135     int adj_baf_lrr;
136     int regress_baf_lrr;
137     int lrr_gc_order;
138     int flags;
139     int filter_logic;
140     filter_t *filter;
141     genome_rules_t *genome_rules;
142     regidx_t *cnp_idx;
143     regitr_t *cnp_itr;
144     regidx_t *mhc_idx;
145     regidx_t *kir_idx;
146 
147     int rid;
148     int n;
149     locus_t *locus_arr;
150     int m_locus;
151     float *gc_arr;
152     int m_gc;
153     int n_flipped;
154 } model_t;
155 
156 typedef struct {
157     int sample_idx;
158     int computed_gender;
159     int rid;
160     int beg_pos;
161     int end_pos;
162     int length;
163     int8_t p_arm;
164     int8_t q_arm;
165     int n_sites;
166     int n_hets;
167     int n50_hets;
168     float bdev;
169     float bdev_se;
170     float ldev;
171     float ldev_se;
172     float lod_lrr_baf;
173     float lod_baf_phase;
174     int n_flips;
175     float baf_conc;
176     float lod_baf_conc;
177     int8_t type;
178     float cf;
179 } mocha_t;
180 
181 typedef struct {
182     int n, m;
183     mocha_t *a;
184 } mocha_table_t;
185 
186 typedef struct {
187     float call_rate;
188     float lrr_median;
189     float lrr_sd;
190     float lrr_auto;
191     float dispersion; // either rho(AD0, AD1) for WGS model or sd(BAF)
192     float baf_conc;
193     float baf_auto;
194     float lrr_gc_rel_ess;
195     float coeffs[MAX_ORDER + 1];
196 } stats_t;
197 
198 typedef struct {
199     int idx;
200     int computed_gender;
201     float adjlrr_sd;
202     int n_sites;
203     int n_missing_gts;
204     int n_hets;
205     int x_nonpar_n_hets;
206     int par1_n_hets;
207     int xtr_n_hets;
208     int par2_n_hets;
209     float x_nonpar_dispersion; // either rho(AD0, AD1) for WGS model or sd(BAF)
210     float x_nonpar_lrr_median;
211     float y_nonpar_lrr_median;
212     float mt_lrr_median;
213     stats_t stats;
214     stats_t *stats_arr;
215     int m_stats, n_stats;
216 
217     int n;
218     int *vcf_imap_arr;
219     int m_vcf_imap;
220     int16_t *data_arr[2];
221     int m_data[2];
222     int8_t *phase_arr;
223     int m_phase;
224 } sample_t;
225 
226 /****************************************
227  * INLINE FUNCTIONS AND CONSTANTS       *
228  ****************************************/
229 
230 // this macro from ksort.h defines the function
231 // void ks_introsort_int(size_t n, int a[]);
KSORT_INIT_GENERIC(int)232 KSORT_INIT_GENERIC(int)
233 
234 static inline float sqf(float x) { return x * x; }
sq(double x)235 static inline double sq(double x) { return x * x; }
236 // the x == y is necessary in case x == -INFINITY
log_mean_expf(float x,float y)237 static inline float log_mean_expf(float x, float y) {
238     return x == y ? x : (x > y ? x + logf(1.0f + expf(y - x)) : y + logf(1.0f + expf(x - y))) - (float)M_LN2;
239 }
240 
241 static beta_binom_t *beta_binom_null, *beta_binom_alt;
beta_binom_log_lkl(const beta_binom_t * self,int16_t ad0,int16_t ad1)242 static inline float beta_binom_log_lkl(const beta_binom_t *self, int16_t ad0, int16_t ad1) {
243     return ad0 == bcf_int16_missing || ad1 == bcf_int16_missing ? 0.0f : beta_binom_log_unsafe(self, ad0, ad1);
244 }
245 
246 /****************************************
247  * CONVERT FLOAT TO INT16 AND VICEVERSA *
248  ****************************************/
249 
250 #define INT16_SCALE 1000 // BAF values from Illumina are scaled to 1000
251 
float_to_int16(float in)252 static inline int16_t float_to_int16(float in) {
253     return isnan(in) ? bcf_int16_missing : (int16_t)roundf(INT16_SCALE * in);
254 }
255 
int16_to_float(int16_t in)256 static inline float int16_to_float(int16_t in) { return in == bcf_int16_missing ? NAN : ((float)in) / INT16_SCALE; }
257 
258 /******************************************
259  * LRR AND COVERAGE POLYNOMIAL REGRESSION *
260  ******************************************/
261 
262 // https://stackoverflow.com/questions/5083465/fast-efficient-least-squares-fit-algorithm-in-c
263 // https://github.com/natedomin/polyfit/blob/master/polyfit.c
264 // https://software.intel.com/sites/products/documentation/doclib/mkl_sa/11/mkl_lapack_examples/sgels_ex.c.htm
265 // this function needs to use doubles internally when dealing with WGS data
polyfit(const float * lrr,const float * gc,int n,const int * imap,int order,float * coeffs)266 static int polyfit(const float *lrr, const float *gc, int n, const int *imap, int order, float *coeffs) {
267     int m = order + 1;
268     if (n < m || order > MAX_ORDER) return -1;
269     double B[MAX_ORDER + 1] = {0.0};
270     double P[((MAX_ORDER + 1) * 2) + 1] = {0.0};
271     double A[(MAX_ORDER + 1) * 2 * (MAX_ORDER + 1)] = {0.0};
272 
273     // identify the column vector
274     for (int i = 0; i < n; i++) {
275         float x = imap ? gc[imap[i]] : gc[i];
276         float y = lrr[i];
277         if (isnan(x) || isnan(y)) continue;
278         float powx = 1.0f;
279 
280         for (int j = 0; j < m; j++) {
281             B[j] += (double)(y * powx);
282             powx *= x;
283         }
284     }
285 
286     // initialize the PowX array
287     P[0] = (float)n;
288 
289     // compute the sum of the powers of X
290     for (int i = 0; i < n; i++) {
291         float x = imap ? gc[imap[i]] : gc[i];
292         if (isnan(x)) continue;
293         float powx = x;
294 
295         for (int j = 1; j < ((2 * m) + 1); j++) {
296             P[j] += (double)powx;
297             powx *= x;
298         }
299     }
300 
301     // initialize the reduction matrix
302     for (int i = 0; i < m; i++) {
303         for (int j = 0; j < m; j++) {
304             A[(i * (2 * m)) + j] = P[i + j];
305         }
306 
307         A[(i * (2 * m)) + (i + m)] = 1.0;
308     }
309 
310     // move the identity matrix portion of the redux matrix to the
311     // left side (find the inverse of the left side of the redux matrix)
312     for (int i = 0; i < m; i++) {
313         double x = A[(i * (2 * m)) + i];
314         if (x != 0) {
315             for (int k = 0; k < (2 * m); k++) {
316                 A[(i * (2 * m)) + k] /= x;
317             }
318 
319             for (int j = 0; j < m; j++) {
320                 if (i != j) {
321                     double y = A[(j * (2 * m)) + i];
322                     for (int k = 0; k < (2 * m); k++) {
323                         A[(j * (2 * m)) + k] -= y * A[(i * (2 * m)) + k];
324                     }
325                 }
326             }
327         } else {
328             // cannot work with singular matrices
329             return -1;
330         }
331     }
332 
333     // calculate coefficients
334     for (int i = 0; i < m; i++) {
335         for (int j = 0; j < m; j++) {
336             double x = 0.0;
337             for (int k = 0; k < m; k++) {
338                 x += (A[(i * (2 * m)) + (k + m)] * B[k]);
339             }
340             coeffs[i] = (float)x;
341         }
342     }
343 
344     return 0;
345 }
346 
ad_to_lrr_baf(const int16_t * ad0,const int16_t * ad1,float * lrr,float * baf,int n)347 static void ad_to_lrr_baf(const int16_t *ad0, const int16_t *ad1, float *lrr, float *baf, int n) {
348     // this function keeps a list of logarithms of integers to minimize log calls
349     static float *logf_arr = NULL;
350     static int n_logf = 0, m_logf = 0;
351     if (ad0 == NULL && ad1 == NULL && n == 0) {
352         free(logf_arr);
353         return;
354     }
355 
356     for (int i = 0; i < n; i++) {
357         if (ad0[i] == bcf_int16_missing && ad1[i] == bcf_int16_missing) {
358             lrr[i] = NAN;
359             baf[i] = NAN;
360             continue;
361         }
362         int cov = (int)(ad0[i] == bcf_int16_missing ? 0 : ad0[i]) + (int)(ad1[i] == bcf_int16_missing ? 0 : ad1[i]);
363         if (cov == 0) {
364             lrr[i] = 0;
365             baf[i] = NAN;
366         } else {
367             if (cov > n_logf) {
368                 hts_expand(float, cov, m_logf, logf_arr);
369                 for (int j = n_logf; j < cov; j++) logf_arr[j] = logf(j + 1);
370                 n_logf = cov;
371             }
372             lrr[i] = logf_arr[cov - 1];
373             baf[i] = (ad0[i] == bcf_int16_missing || ad1[i] == bcf_int16_missing) ? NAN : (float)ad1[i] / (float)cov;
374         }
375     }
376 }
377 
adjust_lrr(float * lrr,const float * gc,int n,const int * imap,const float * coeffs,int order)378 static void adjust_lrr(float *lrr, const float *gc, int n, const int *imap, const float *coeffs, int order) {
379     for (int i = 0; i < n; i++) {
380         float x = imap ? gc[imap[i]] : gc[i];
381         float powx = 1.0f;
382         for (int j = 0; j <= order; j++) {
383             lrr[i] -= coeffs[j] * powx;
384             powx *= x;
385         }
386     }
387 }
388 
389 // computes total sum of squares
390 // this function needs to use doubles internally when dealing with WGS data
get_tss(const float * v,int n)391 static float get_tss(const float *v, int n) {
392     double mean = 0.0;
393     int j = 0;
394     for (int i = 0; i < n; i++) {
395         if (!isnan(v[i])) {
396             mean += (double)v[i];
397             j++;
398         }
399     }
400     if (j <= 1) return NAN;
401     mean /= (double)j;
402 
403     double tss = 0.0;
404     for (int i = 0; i < n; i++) {
405         if (!isnan(v[i])) tss += sq((double)v[i] - mean);
406     }
407     return (float)tss;
408 }
409 
410 /*********************************
411  * HMM AND OPTIMIZATION METHODS  *
412  *********************************/
413 
414 // compute Viterbi path from probabilities
retrace_viterbi(int T,int N,const float * log_prb,const int8_t * ptr)415 static int8_t *retrace_viterbi(int T, int N, const float *log_prb, const int8_t *ptr) {
416     int i, t;
417     int8_t *path = (int8_t *)malloc(T * sizeof(int8_t));
418 
419     // initialize last path state
420     path[T - 1] = 0;
421     for (i = 1; i < N; i++)
422         if (log_prb[(int)path[T - 1]] < log_prb[i]) path[T - 1] = (int8_t)i;
423 
424     // compute best path by tracing back the Markov chain
425     for (t = T - 1; t > 0; t--) path[t - 1] = ptr[(t - 1) * N + (int)path[t]];
426 
427     return path;
428 }
429 
430 // rescale Viterbi log probabilities to avoid underflow issues
rescale_log_prb(float * log_prb,int n)431 static inline void rescale_log_prb(float *log_prb, int n) {
432     float max = -INFINITY;
433     for (int i = 0; i < n; i++) max = max > log_prb[i] ? max : log_prb[i];
434     for (int i = 0; i < n; i++) log_prb[i] -= max;
435 }
436 
437 // compute the Viterbi path from BAF
438 // n is the length of the hidden Markov model
439 // m is the number of possible BAF deviations
log_viterbi_run(const float * emis_log_lkl,int T,int m,float xy_major_log_prb,float xy_minor_log_prb,float tel_log_prb,float flip_log_prb,int last_p,int first_q)440 static int8_t *log_viterbi_run(const float *emis_log_lkl, int T, int m, float xy_major_log_prb, float xy_minor_log_prb,
441                                float tel_log_prb, float flip_log_prb, int last_p, int first_q) {
442     int t, i, j, changeidx;
443 
444     // determine the number of hidden states based on whether phase information is used
445     int N = 1 + m + (isnan(flip_log_prb) ? 0 : m);
446 
447     // allocate memory necessary for running the algorithm
448     float *log_prb = (float *)malloc(N * sizeof(float));
449     float *new_log_prb = (float *)malloc(N * sizeof(float));
450     int8_t *ptr = (int8_t *)malloc(N * (T - 1) * sizeof(int8_t));
451     int8_t *path;
452 
453     // initialize and rescale the first state
454     log_prb[0] = emis_log_lkl[0];
455     for (i = 1; i < N; i++) log_prb[i] = tel_log_prb + emis_log_lkl[i];
456     rescale_log_prb(log_prb, N);
457 
458     // compute best probabilities at each position
459     for (t = 1; t < T; t++) {
460         // this causes a penalty for mosaic chromosomal calls across the centromeres
461         float exit_log_prb = t > last_p ? xy_major_log_prb : xy_minor_log_prb;
462         float enter_log_prb = t < first_q ? xy_major_log_prb : xy_minor_log_prb;
463 
464         for (i = 0; i < N; i++) {
465             new_log_prb[i] = log_prb[i];
466             ptr[(t - 1) * N + i] = (int8_t)i;
467         }
468 
469         // compute whether a state switch should be considered for null state
470         for (i = 1; i < N; i++) {
471             if (new_log_prb[0] < log_prb[i] + exit_log_prb) {
472                 new_log_prb[0] = log_prb[i] + exit_log_prb;
473                 ptr[(t - 1) * N] = ptr[(t - 1) * N + i];
474             }
475             if (new_log_prb[i] < log_prb[0] + enter_log_prb) {
476                 new_log_prb[i] = log_prb[0] + enter_log_prb;
477                 ptr[(t - 1) * N + i] = ptr[(t - 1) * N];
478             }
479         }
480 
481         // compute whether a state switch should be considered for each other state
482         // it will run twice if and only if phasing is used
483         for (j = 0; j == 0 || (!isnan(flip_log_prb) && j == m); j += m) {
484             float change_log_prb = log_prb[0] + enter_log_prb;
485             changeidx = 0;
486             for (i = 0; i < m; i++) {
487                 if (change_log_prb < log_prb[1 + j + i] + (xy_major_log_prb + xy_minor_log_prb) * 0.75f) {
488                     change_log_prb = log_prb[1 + j + i] + (xy_major_log_prb + xy_minor_log_prb) * 0.75f;
489                     changeidx = 1 + j + i;
490                 }
491             }
492             for (i = 0; i < m; i++) {
493                 if (new_log_prb[1 + j + i] < change_log_prb) {
494                     new_log_prb[1 + j + i] = change_log_prb;
495                     ptr[(t - 1) * N + 1 + j + i] = ptr[(t - 1) * N + changeidx];
496                 }
497             }
498         }
499 
500         // compute whether a phase flip should be considered for non-null states
501         if (!isnan(flip_log_prb)) {
502             for (i = 0; i < m; i++) {
503                 if (new_log_prb[1 + i] < new_log_prb[1 + m + i] + flip_log_prb) {
504                     new_log_prb[1 + i] = new_log_prb[1 + m + i] + flip_log_prb;
505                     ptr[(t - 1) * N + 1 + i] = ptr[(t - 1) * N + 1 + m + i];
506                 }
507                 if (new_log_prb[1 + m + i] < new_log_prb[1 + i] + flip_log_prb) {
508                     new_log_prb[1 + m + i] = new_log_prb[1 + i] + flip_log_prb;
509                     ptr[(t - 1) * N + 1 + m + i] = ptr[(t - 1) * N + 1 + i];
510                 }
511             }
512         }
513 
514         // update and rescale the current state
515         new_log_prb[0] += emis_log_lkl[t * N];
516         for (i = 0; i < m; i++) {
517             new_log_prb[1 + i] += emis_log_lkl[t * N + 1 + i];
518             if (!isnan(flip_log_prb)) new_log_prb[1 + m + i] += emis_log_lkl[t * N + 1 + m + i];
519         }
520         for (i = 0; i < N; i++) log_prb[i] = new_log_prb[i];
521         rescale_log_prb(log_prb, N);
522     }
523 
524     // add closing cost to the last state
525     for (i = 1; i < N; i++) log_prb[i] += tel_log_prb;
526     rescale_log_prb(log_prb, N);
527 
528     path = retrace_viterbi(T, N, log_prb, ptr);
529 
530     // free memory
531     free(log_prb);
532     free(new_log_prb);
533     free(ptr);
534 
535     // symmetrize the path
536     if (!isnan(flip_log_prb))
537         for (i = 0; i < T; i++)
538             if (path[i] > m) path[i] = (int8_t)m - path[i];
539 
540     return path;
541 }
542 
543 /*********************************
544  * LRR AND BAF LIKELIHOODS       *
545  *********************************/
546 
547 // rescale emission probabilities to avoid problems with outliers
548 // TODO find a better name for this function
rescale_emis_log_lkl(float * log_prb,int n,float err_log_prb)549 static void rescale_emis_log_lkl(float *log_prb, int n, float err_log_prb) {
550     float min_thr = -INFINITY;
551     for (int i = 0; i < n; i++)
552         if (min_thr < log_prb[i]) min_thr = log_prb[i];
553     min_thr += err_log_prb;
554     for (int i = 0; i < n; i++)
555         if (log_prb[i] < min_thr) log_prb[i] = min_thr;
556 }
557 
norm_log_lkl(float x,float m,float s,float w)558 static inline float norm_log_lkl(float x, float m, float s, float w) {
559     return isnan(x) ? 0.0f : -0.5f * sqf((x - m) / s) * w;
560 }
561 
562 // lrr_bias is used in a different way from what done by Petr Danecek in bcftools/vcfcnv.c
lrr_baf_log_lkl(float lrr,float baf,float ldev,float bdev,float lrr_sd,float baf_sd,float lrr_bias)563 static inline float lrr_baf_log_lkl(float lrr, float baf, float ldev, float bdev, float lrr_sd, float baf_sd,
564                                     float lrr_bias) {
565     return norm_log_lkl(lrr, ldev, lrr_sd, lrr_bias)
566            + log_mean_expf(norm_log_lkl(baf - 0.5f, bdev, baf_sd, 1.0f), norm_log_lkl(baf - 0.5f, -bdev, baf_sd, 1.0f));
567 }
568 
baf_phase_log_lkl(float baf,int8_t phase,float bdev,float baf_sd)569 static inline float baf_phase_log_lkl(float baf, int8_t phase, float bdev, float baf_sd) {
570     return phase == 0 ? log_mean_expf(norm_log_lkl(baf - 0.5f, bdev, baf_sd, 1.0f),
571                                       norm_log_lkl(baf - 0.5f, -bdev, baf_sd, 1.0f))
572                       : norm_log_lkl(baf - 0.5f, (float)SIGN(phase) * bdev, baf_sd, 1.0f);
573 }
574 
575 // precomupute emission probabilities
lrr_baf_emis_log_lkl(const float * lrr,const float * baf,int T,const int * imap,float err_log_prb,float lrr_bias,float lrr_hap2dip,float lrr_sd,float baf_sd,const float * bdev_lrr_baf_arr,int m)576 static float *lrr_baf_emis_log_lkl(const float *lrr, const float *baf, int T, const int *imap, float err_log_prb,
577                                    float lrr_bias, float lrr_hap2dip, float lrr_sd, float baf_sd,
578                                    const float *bdev_lrr_baf_arr, int m) {
579     float *ldev = (float *)malloc(m * sizeof(float));
580     for (int i = 0; i < m; i++) ldev[i] = -logf(1.0f - 2.0f * bdev_lrr_baf_arr[i]) / (float)M_LN2 * lrr_hap2dip;
581     int N = 1 + 2 * m;
582     float *emis_log_lkl = (float *)malloc(N * T * sizeof(float));
583     for (int t = 0; t < T; t++) {
584         float x = imap ? lrr[imap[t]] : lrr[t];
585         float y = imap ? baf[imap[t]] : baf[t];
586         emis_log_lkl[t * N] = lrr_baf_log_lkl(x, y, 0.0f, 0.0f, lrr_sd, baf_sd, lrr_bias);
587         for (int i = 0; i < m; i++) {
588             emis_log_lkl[t * N + 1 + i] = lrr_baf_log_lkl(x, y, ldev[i], bdev_lrr_baf_arr[i], lrr_sd, baf_sd, lrr_bias);
589         }
590         // add states to distinguish LRR waves from true mosaic gains/losses
591         for (int i = 0; i < m; i++) {
592             if (bdev_lrr_baf_arr[i] < -1.0f / 6.0f || bdev_lrr_baf_arr[i] >= 1.0f / 6.0f)
593                 emis_log_lkl[t * N + 1 + m + i] = emis_log_lkl[t * N] + err_log_prb;
594             else
595                 emis_log_lkl[t * N + 1 + m + i] =
596                     lrr_baf_log_lkl(x, y, bdev_lrr_baf_arr[i], 0.0f, lrr_sd, baf_sd, lrr_bias);
597         }
598         rescale_emis_log_lkl(&emis_log_lkl[t * N], N, err_log_prb);
599     }
600     free(ldev);
601     return emis_log_lkl;
602 }
603 
604 // precomupute emission probabilities
baf_phase_emis_log_lkl(const float * baf,const int8_t * gt_phase,int T,const int * imap,float err_log_prb,float baf_sd,const float * bdev,int m)605 static float *baf_phase_emis_log_lkl(const float *baf, const int8_t *gt_phase, int T, const int *imap,
606                                      float err_log_prb, float baf_sd, const float *bdev, int m) {
607     int N = 1 + 2 * m;
608     float *emis_log_lkl = (float *)malloc(N * T * sizeof(float));
609     for (int t = 0; t < T; t++) {
610         float x = imap ? baf[imap[t]] : baf[t];
611         int8_t p = imap ? gt_phase[imap[t]] : gt_phase[t];
612         emis_log_lkl[t * N] = baf_phase_log_lkl(x, (int8_t)1, 0.0f, baf_sd);
613         for (int i = 0; i < m; i++) {
614             emis_log_lkl[t * N + 1 + i] = baf_phase_log_lkl(x, p, bdev[i], baf_sd);
615             if (p == 0)
616                 emis_log_lkl[t * N + 1 + m + i] = emis_log_lkl[t * N + 1 + i];
617             else
618                 emis_log_lkl[t * N + 1 + m + i] = baf_phase_log_lkl(x, p, -bdev[i], baf_sd);
619         }
620         rescale_emis_log_lkl(&emis_log_lkl[t * N], N, err_log_prb);
621     }
622     return emis_log_lkl;
623 }
624 
cnp_edge_is_not_cn2_lrr_baf(const float * lrr,const float * baf,int n,int a,int b,float xy_log_prb,float err_log_prb,float lrr_bias,float lrr_hap2dip,float lrr_sd,float baf_sd,float ldev,float bdev)625 static int cnp_edge_is_not_cn2_lrr_baf(const float *lrr, const float *baf, int n, int a, int b, float xy_log_prb,
626                                        float err_log_prb, float lrr_bias, float lrr_hap2dip, float lrr_sd, float baf_sd,
627                                        float ldev, float bdev) {
628     // test left edge
629     float sum_log_lkl = 0.0f;
630     for (int i = a - 1; i >= 0; i--) {
631         float log_lkl = lrr_baf_log_lkl(lrr[i], baf[i], ldev, bdev, lrr_sd, baf_sd, lrr_bias)
632                         - lrr_baf_log_lkl(lrr[i], baf[i], 0.0f, 0.0f, lrr_sd, baf_sd, lrr_bias);
633         if (!isnan(err_log_prb)) {
634             if (log_lkl < err_log_prb)
635                 log_lkl = err_log_prb;
636             else if (log_lkl > -err_log_prb)
637                 log_lkl = -err_log_prb;
638         }
639         sum_log_lkl += log_lkl;
640         if (sum_log_lkl > -xy_log_prb) return -1;
641         if (sum_log_lkl < xy_log_prb) break;
642     }
643 
644     // test right edge
645     sum_log_lkl = 0.0f;
646     for (int i = b + 1; i < n; i++) {
647         float log_lkl = lrr_baf_log_lkl(lrr[i], baf[i], ldev, bdev, lrr_sd, baf_sd, lrr_bias)
648                         - lrr_baf_log_lkl(lrr[i], baf[i], 0, 0, lrr_sd, baf_sd, lrr_bias);
649         if (!isnan(err_log_prb)) {
650             if (log_lkl < err_log_prb)
651                 log_lkl = err_log_prb;
652             else if (log_lkl > -err_log_prb)
653                 log_lkl = -err_log_prb;
654         }
655         sum_log_lkl += log_lkl;
656         if (sum_log_lkl > -xy_log_prb) return -1;
657         if (sum_log_lkl < xy_log_prb) break;
658     }
659 
660     return 0;
661 }
662 
663 // return the LOD likelihood for a segment
lrr_baf_lod(const float * lrr_arr,const float * baf_arr,int n,const int * imap,float err_log_prb,float lrr_bias,float lrr_hap2dip,float lrr_sd,float baf_sd,double bdev_lrr_baf)664 static double lrr_baf_lod(const float *lrr_arr, const float *baf_arr, int n, const int *imap, float err_log_prb,
665                           float lrr_bias, float lrr_hap2dip, float lrr_sd, float baf_sd, double bdev_lrr_baf) {
666     if (n == 0 || bdev_lrr_baf < -0.5 || bdev_lrr_baf > 0.25) return -INFINITY; // kmin_brent does not handle NAN
667 
668     float ldev = -logf(1.0f - 2.0f * (float)bdev_lrr_baf) / (float)M_LN2 * lrr_hap2dip;
669     float ret = 0.0f;
670     for (int i = 0; i < n; i++) {
671         float lrr = imap ? lrr_arr[imap[i]] : lrr_arr[i];
672         float baf = imap ? baf_arr[imap[i]] : baf_arr[i];
673         float log_lkl = lrr_baf_log_lkl(lrr, baf, ldev, (float)bdev_lrr_baf, lrr_sd, baf_sd, lrr_bias)
674                         - lrr_baf_log_lkl(lrr, baf, 0.0f, 0.0f, lrr_sd, baf_sd, lrr_bias);
675         if (!isnan(err_log_prb)) {
676             if (log_lkl < err_log_prb)
677                 log_lkl = err_log_prb;
678             else if (log_lkl > -err_log_prb)
679                 log_lkl = -err_log_prb;
680         }
681         ret += log_lkl;
682     }
683     return (double)ret * M_LOG10E;
684 }
685 
686 // return the LOD likelihood for a segment
baf_phase_lod(const float * baf_arr,const int8_t * gt_phase,int n,const int * imap,const int8_t * as,float err_log_prb,float baf_sd,double bdev)687 static double baf_phase_lod(const float *baf_arr, const int8_t *gt_phase, int n, const int *imap, const int8_t *as,
688                             float err_log_prb, float baf_sd, double bdev) {
689     if (n == 0 || bdev < 0.0 || bdev > 0.5) return -INFINITY; // kmin_brent does not handle NAN
690 
691     float ret = 0.0f;
692     for (int i = 0; i < n; i++) {
693         float baf = imap ? baf_arr[imap[i]] : baf_arr[i];
694         int8_t p = imap ? gt_phase[imap[i]] : gt_phase[i];
695         if (as) p *= (int8_t)SIGN(as[i]); // notice as has no imap
696         float log_lkl = baf_phase_log_lkl(baf, p, (float)bdev, baf_sd) - baf_phase_log_lkl(baf, 0, 0.0f, baf_sd);
697         if (!isnan(err_log_prb)) {
698             if (log_lkl < err_log_prb)
699                 log_lkl = err_log_prb;
700             else if (log_lkl > -err_log_prb)
701                 log_lkl = -err_log_prb;
702         }
703         ret += log_lkl;
704     }
705     return (double)ret * M_LOG10E;
706 }
707 
708 typedef struct
709 {
710     const float *baf;
711     const int8_t *gt_phase;
712     int n;
713     const int *imap;
714     int8_t *path;
715     float err_log_prb;
716     float baf_sd;
717 }   f1_data_t;
718 
f1(double x,void * data)719 double f1(double x, void *data)
720 {
721     f1_data_t	*d = data;
722 
723     return -baf_phase_lod(d->baf, d->gt_phase, d->n, d->imap, d->path,
724                           d->err_log_prb, d->baf_sd, x);
725 }
726 
f1_pack(const float * baf,const int8_t * gt_phase,int n,const int * imap,int8_t * path,float err_log_prb,float baf_sd)727 static inline f1_data_t *f1_pack(const float *baf, const int8_t *gt_phase, int n,
728 		   const int *imap, int8_t *path, float err_log_prb,
729 		   float baf_sd)
730 {
731     // Switch to malloc() and free() if more than one object must exist at
732     // any given time
733     f1_data_t	*d = malloc(sizeof(f1_data_t));
734 
735     d->baf = baf;
736     d->gt_phase = gt_phase;
737     d->n = n;
738     d->imap = imap;
739     d->path = path;
740     d->err_log_prb = err_log_prb;
741     d->baf_sd = baf_sd;
742 
743     return d;
744 }
745 
746 // TODO find a better title for this function
compare_models(const float * baf,const int8_t * gt_phase,int n,const int * imap,float xy_log_prb,float err_log_prb,float flip_log_prb,float tel_log_prb,float baf_sd,const float * bdev,int m)747 static float compare_models(const float *baf, const int8_t *gt_phase, int n, const int *imap, float xy_log_prb,
748                             float err_log_prb, float flip_log_prb, float tel_log_prb, float baf_sd, const float *bdev,
749                             int m) {
750     if (n == 0) return NAN;
751     float *emis_log_lkl = baf_phase_emis_log_lkl(baf, gt_phase, n, imap, err_log_prb, baf_sd, bdev, m);
752     int8_t *path = log_viterbi_run(emis_log_lkl, n, m, xy_log_prb, xy_log_prb, tel_log_prb, flip_log_prb, 0, 0);
753     free(emis_log_lkl);
754     int n_flips = 0;
755     for (int i = 1; i < n; i++)
756         if (path[i - 1] && path[i] && path[i - 1] != path[i]) n_flips++;
757     double x;
758     f1_data_t *f1_data = f1_pack(baf, gt_phase, n, imap, path, err_log_prb,
759 				 baf_sd);
760     double fx = kmin_brent(f1, 0.1, 0.2, f1_data, KMIN_EPS, &x);
761     free(f1_data);
762     free(path);
763     return -(float)fx + (float)n_flips * flip_log_prb * (float)M_LOG10E;
764 }
765 
baf_log_lkl(const float * baf_arr,int n,const int * imap,float baf_sd,double bdev)766 static double baf_log_lkl(const float *baf_arr, int n, const int *imap, float baf_sd, double bdev) {
767     if (n == 0 || bdev < 0.0 || bdev > 0.5) return -INFINITY; // kmin_brent does not handle NAN
768 
769     double ret = 0.0;
770     for (int i = 0; i < n; i++) {
771         float baf = imap ? baf_arr[imap[i]] : baf_arr[i];
772         if (isnan(baf)) continue;
773         float log_lkl = log_mean_expf(norm_log_lkl(baf - 0.5f, (float)bdev, baf_sd, 1.0f),
774                                       norm_log_lkl(baf - 0.5f, -(float)bdev, baf_sd, 1.0f));
775         ret += (double)log_lkl;
776     }
777     return ret * M_LOG10E;
778 }
779 
get_sample_mean(const float * v,int n,const int * imap)780 static float get_sample_mean(const float *v, int n, const int *imap) {
781     float mean = 0.0;
782     int j = 0;
783     for (int i = 0; i < n; i++) {
784         float tmp = imap ? v[imap[i]] : v[i];
785         if (!isnan(tmp)) {
786             mean += tmp;
787             j++;
788         }
789     }
790     if (j <= 1) return NAN;
791     return mean /= (float)j;
792 }
793 
794 typedef struct
795 {
796     const float *baf_arr;
797     int n;
798     const int *imap;
799     float baf_sd;
800 }   f7_data_t;
801 
f7(double x,void * data)802 double f7(double x, void *data)
803 {
804     f7_data_t *d = data;
805 
806     return -baf_log_lkl(d->baf_arr, d->n, d->imap, d->baf_sd, x);
807 }
808 
f7_pack(const float * baf_arr,int n,const int * imap,float baf_sd)809 static inline f7_data_t *f7_pack(const float *baf_arr, int n, const int *imap,
810     float baf_sd)
811 {
812     f7_data_t *d = malloc(sizeof(f7_data_t));
813 
814     d->baf_arr = baf_arr;
815     d->n = n;
816     d->imap = imap;
817     d->baf_sd = baf_sd;
818 
819     return d;
820 }
821 
get_baf_bdev(const float * baf_arr,int n,const int * imap,float baf_sd)822 static float get_baf_bdev(const float *baf_arr, int n, const int *imap, float baf_sd) {
823     double bdev = 0.0;
824     int j = 0;
825     for (int i = 0; i < n; i++) {
826         float baf = imap ? baf_arr[imap[i]] : baf_arr[i];
827         if (isnan(baf)) continue;
828         bdev += fabs((double)baf - 0.5);
829         j++;
830     }
831     if (j == 0) return NAN;
832     bdev /= j;
833     // simple method to compute bdev should work well for germline duplications
834     if ((float)bdev > 2.0f * baf_sd) return (float)bdev;
835     f7_data_t *f7_data = f7_pack(baf_arr, n, imap, baf_sd);
836     kmin_brent(f7, 0.1, 0.2, f7_data, KMIN_EPS, &bdev);
837     free(f7_data);
838     return (float)bdev < 1e-4 ? (float)NAN : (float)bdev;
839 }
840 
get_max_sum(const int16_t * ad0,const int16_t * ad1,int n,const int * imap,int * n1,int * n2)841 static void get_max_sum(const int16_t *ad0, const int16_t *ad1, int n, const int *imap, int *n1, int *n2) {
842     *n1 = 0;
843     *n2 = 0;
844     for (int i = 0; i < n; i++) {
845         int a = imap ? ad0[imap[i]] : ad0[i];
846         int b = imap ? ad1[imap[i]] : ad1[i];
847         if (a != bcf_int16_missing && b != bcf_int16_missing) {
848             if (a > *n1) *n1 = a;
849             if (b > *n1) *n1 = b;
850             if (a + b > *n2) *n2 = a + b;
851         }
852     }
853 }
854 
ad_log_lkl(const int16_t * ad0_arr,const int16_t * ad1_arr,int n,const int * imap,float ad_rho,double bdev)855 static double ad_log_lkl(const int16_t *ad0_arr, const int16_t *ad1_arr, int n, const int *imap, float ad_rho,
856                          double bdev) {
857     if (n == 0 || bdev < 0.0 || bdev > 0.5) return -INFINITY; // kmin_brent does not handle NAN
858 
859     int n1, n2;
860     get_max_sum(ad0_arr, ad1_arr, n, imap, &n1, &n2);
861     beta_binom_update(beta_binom_alt, 0.5f + (float)bdev, ad_rho, n1, n2);
862 
863     double ret = 0.0;
864     for (int i = 0; i < n; i++) {
865         int16_t ad0 = imap ? ad0_arr[imap[i]] : ad0_arr[i];
866         int16_t ad1 = imap ? ad1_arr[imap[i]] : ad1_arr[i];
867         float log_lkl =
868             log_mean_expf(beta_binom_log_lkl(beta_binom_alt, ad0, ad1), beta_binom_log_lkl(beta_binom_alt, ad1, ad0));
869         ret += (double)log_lkl;
870     }
871     return (double)ret * M_LOG10E;
872 }
873 
874 typedef struct
875 {
876     const int16_t *ad0_arr;
877     const int16_t *ad1_arr;
878     int n;
879     const int *imap;
880     float ad_rho;
881 }   f5_data_t;
882 
f5(double x,void * data)883 double f5(double x, void *data)
884 {
885     f5_data_t *d = data;
886 
887     return -ad_log_lkl(d->ad0_arr, d->ad1_arr, d->n, d->imap, d->ad_rho, x);
888 }
889 
f5_pack(const int16_t * ad0_arr,const int16_t * ad1_arr,int n,const int * imap,float ad_rho)890 static inline f5_data_t *f5_pack(const int16_t *ad0_arr,
891     const int16_t *ad1_arr, int n, const int *imap, float ad_rho)
892 {
893     f5_data_t *d = malloc(sizeof(f5_data_t));
894 
895     d->ad0_arr = ad0_arr;
896     d->ad1_arr = ad1_arr;
897     d->n = n;
898     d->imap = imap;
899     d->ad_rho = ad_rho;
900 
901     return d;
902 }
903 
get_ad_bdev(const int16_t * ad0_arr,const int16_t * ad1_arr,int n,const int * imap,float ad_rho)904 static float get_ad_bdev(const int16_t *ad0_arr, const int16_t *ad1_arr, int n, const int *imap, float ad_rho) {
905     double bdev = 0.0;
906     f5_data_t *f5_data = f5_pack(ad0_arr, ad1_arr, n, imap, ad_rho);
907     kmin_brent(f5, 0.1, 0.2, f5_data, KMIN_EPS, &bdev);
908     free(f5_data);
909     return (float)bdev < 1e-4 ? (float)NAN : (float)bdev;
910 }
911 
912 /*********************************
913  * WGS AD LIKELIHOODS            *
914  *********************************/
915 
lrr_ad_log_lkl(float lrr,int16_t ad0,int16_t ad1,float ldev,float lrr_sd,float lrr_bias,const beta_binom_t * beta_binom)916 static inline float lrr_ad_log_lkl(float lrr, int16_t ad0, int16_t ad1, float ldev, float lrr_sd, float lrr_bias,
917                                    const beta_binom_t *beta_binom) {
918     return norm_log_lkl(lrr, ldev, lrr_sd, lrr_bias)
919            + log_mean_expf(beta_binom_log_lkl(beta_binom, ad0, ad1), beta_binom_log_lkl(beta_binom, ad1, ad0));
920 }
921 
ad_phase_log_lkl(int16_t ad0,int16_t ad1,int8_t phase,const beta_binom_t * beta_binom)922 static inline float ad_phase_log_lkl(int16_t ad0, int16_t ad1, int8_t phase, const beta_binom_t *beta_binom) {
923     return phase == 0
924                ? log_mean_expf(beta_binom_log_lkl(beta_binom, ad0, ad1), beta_binom_log_lkl(beta_binom, ad1, ad0))
925                : (phase > 0 ? beta_binom_log_lkl(beta_binom, ad0, ad1) : beta_binom_log_lkl(beta_binom, ad1, ad0));
926 }
927 
lrr_ad_emis_log_lkl(const float * lrr,const int16_t * ad0,const int16_t * ad1,int T,const int * imap,float err_log_prb,float lrr_bias,float lrr_hap2dip,float lrr_sd,float ad_rho,const float * bdev_lrr_baf_arr,int m)928 static float *lrr_ad_emis_log_lkl(const float *lrr, const int16_t *ad0, const int16_t *ad1, int T, const int *imap,
929                                   float err_log_prb, float lrr_bias, float lrr_hap2dip, float lrr_sd, float ad_rho,
930                                   const float *bdev_lrr_baf_arr, int m) {
931     int N = 1 + 2 * m;
932     int n1, n2;
933     get_max_sum(ad0, ad1, T, NULL, &n1, &n2);
934     float *emis_log_lkl = (float *)malloc(N * T * sizeof(float));
935     for (int i = 0; i < 1 + m; i++) {
936         float ldev = i == 0 ? 0.0f : -logf(1.0f - 2.0f * bdev_lrr_baf_arr[i - 1]) / (float)M_LN2 * lrr_hap2dip;
937         float bdev = i == 0 ? 0.0f : fabsf(bdev_lrr_baf_arr[i - 1]);
938         beta_binom_t *beta_binom = i == 0 ? beta_binom_null : beta_binom_alt;
939         beta_binom_update(beta_binom, 0.5f + bdev, ad_rho, n1, n2);
940 
941         for (int t = 0; t < T; t++) {
942             float x = imap ? lrr[imap[t]] : lrr[t];
943             int16_t a = imap ? ad0[imap[t]] : ad0[t];
944             int16_t b = imap ? ad1[imap[t]] : ad1[t];
945             emis_log_lkl[t * N + i] = lrr_ad_log_lkl(x, a, b, ldev, lrr_sd, lrr_bias, beta_binom);
946         }
947     }
948     // generate states that should attract LRR waves with no BAF signal
949     for (int i = 0; i < m; i++) {
950         // do not make extreme states compete with LRR waves
951         if (bdev_lrr_baf_arr[i] < -1.0f / 6.0f || bdev_lrr_baf_arr[i] >= 1.0f / 6.0f) {
952             for (int t = 0; t < T; t++) emis_log_lkl[t * N + 1 + m + i] = emis_log_lkl[t * N] + err_log_prb;
953         } else {
954             float ldev = -logf(1.0f - 2.0f * bdev_lrr_baf_arr[i]) / (float)M_LN2 * lrr_hap2dip;
955             for (int t = 0; t < T; t++) {
956                 float x = imap ? lrr[imap[t]] : lrr[t];
957                 int16_t a = imap ? ad0[imap[t]] : ad0[t];
958                 int16_t b = imap ? ad1[imap[t]] : ad1[t];
959                 emis_log_lkl[t * N + 1 + m + i] = lrr_ad_log_lkl(x, a, b, ldev, lrr_sd, lrr_bias, beta_binom_null);
960             }
961         }
962     }
963     for (int t = 0; t < T; t++) rescale_emis_log_lkl(&emis_log_lkl[t * N], N, err_log_prb);
964     return emis_log_lkl;
965 }
966 
ad_phase_emis_log_lkl(const int16_t * ad0,const int16_t * ad1,const int8_t * gt_phase,int T,const int * imap,float err_log_prb,float ad_rho,const float * bdev_arr,int m)967 static float *ad_phase_emis_log_lkl(const int16_t *ad0, const int16_t *ad1, const int8_t *gt_phase, int T,
968                                     const int *imap, float err_log_prb, float ad_rho, const float *bdev_arr, int m) {
969     int N = 1 + 2 * m;
970     float *emis_log_lkl = (float *)malloc(N * T * sizeof(float));
971     for (int i = 0; i < 1 + m; i++) {
972         float bdev = i == 0 ? 0.0f : bdev_arr[i - 1];
973         // TODO this function should come out of the loop
974         int n1, n2;
975         get_max_sum(ad0, ad1, T, imap, &n1, &n2);
976         beta_binom_t *beta_binom = i == 0 ? beta_binom_null : beta_binom_alt;
977         beta_binom_update(beta_binom, 0.5f + bdev, ad_rho, n1, n2);
978 
979         for (int t = 0; t < T; t++) {
980             int16_t a = imap ? ad0[imap[t]] : ad0[t];
981             int16_t b = imap ? ad1[imap[t]] : ad1[t];
982             int8_t p = imap ? gt_phase[imap[t]] : gt_phase[t];
983             emis_log_lkl[t * N + i] = ad_phase_log_lkl(a, b, p, beta_binom);
984             if (i > 0) emis_log_lkl[t * N + m + i] = ad_phase_log_lkl(b, a, p, beta_binom);
985         }
986     }
987     for (int t = 0; t < T; t++) rescale_emis_log_lkl(&emis_log_lkl[t * N], N, err_log_prb);
988     return emis_log_lkl;
989 }
990 
cnp_edge_is_not_cn2_lrr_ad(const float * lrr,int16_t * ad0,int16_t * ad1,int n,int a,int b,float xy_log_prb,float err_log_prb,float lrr_bias,float lrr_hap2dip,float lrr_sd,float ad_rho,float ldev,float bdev)991 static int cnp_edge_is_not_cn2_lrr_ad(const float *lrr, int16_t *ad0, int16_t *ad1, int n, int a, int b,
992                                       float xy_log_prb, float err_log_prb, float lrr_bias, float lrr_hap2dip,
993                                       float lrr_sd, float ad_rho, float ldev, float bdev) {
994     int n1, n2;
995     get_max_sum(ad0, ad1, n, NULL, &n1, &n2);
996     beta_binom_update(beta_binom_null, 0.5f, ad_rho, n1, n2);
997     beta_binom_update(beta_binom_alt, 0.5f + bdev, ad_rho, n1, n2);
998 
999     // test left edge
1000     float sum_log_lkl = 0.0f;
1001     for (int i = a - 1; i >= 0; i--) {
1002         float log_lkl = lrr_ad_log_lkl(lrr[i], ad0[i], ad1[i], ldev, lrr_sd, lrr_bias, beta_binom_alt)
1003                         - lrr_ad_log_lkl(lrr[i], ad0[i], ad1[i], 0.0f, lrr_sd, lrr_bias, beta_binom_null);
1004         if (!isnan(err_log_prb)) {
1005             if (log_lkl < err_log_prb)
1006                 log_lkl = err_log_prb;
1007             else if (log_lkl > -err_log_prb)
1008                 log_lkl = -err_log_prb;
1009         }
1010         sum_log_lkl += log_lkl;
1011         if (sum_log_lkl > -xy_log_prb) return -1;
1012         if (sum_log_lkl < xy_log_prb) break;
1013     }
1014 
1015     // test right edge
1016     sum_log_lkl = 0.0f;
1017     for (int i = b + 1; i < n; i++) {
1018         float log_lkl = lrr_ad_log_lkl(lrr[i], ad0[i], ad1[i], ldev, lrr_sd, lrr_bias, beta_binom_alt)
1019                         - lrr_ad_log_lkl(lrr[i], ad0[i], ad1[i], 0.0f, lrr_sd, lrr_bias, beta_binom_null);
1020         if (!isnan(err_log_prb)) {
1021             if (log_lkl < err_log_prb)
1022                 log_lkl = err_log_prb;
1023             else if (log_lkl > -err_log_prb)
1024                 log_lkl = -err_log_prb;
1025         }
1026         sum_log_lkl += log_lkl;
1027         if (sum_log_lkl > -xy_log_prb) return -1;
1028         if (sum_log_lkl < xy_log_prb) break;
1029     }
1030 
1031     return 0;
1032 }
1033 
1034 // return the LOD likelihood for a segment
lrr_ad_lod(const float * lrr_arr,const int16_t * ad0_arr,const int16_t * ad1_arr,int n,const int * imap,float err_log_prb,float lrr_bias,float lrr_hap2dip,float lrr_sd,float ad_rho,double bdev_lrr_baf)1035 static double lrr_ad_lod(const float *lrr_arr, const int16_t *ad0_arr, const int16_t *ad1_arr, int n, const int *imap,
1036                          float err_log_prb, float lrr_bias, float lrr_hap2dip, float lrr_sd, float ad_rho,
1037                          double bdev_lrr_baf) {
1038     if (n == 0 || bdev_lrr_baf < -0.5 || bdev_lrr_baf > 0.25) return -INFINITY; // kmin_brent does not handle NAN
1039 
1040     float ldev = -logf(1.0f - 2.0f * (float)bdev_lrr_baf) / (float)M_LN2 * lrr_hap2dip;
1041     int n1, n2;
1042     get_max_sum(ad0_arr, ad1_arr, n, imap, &n1, &n2);
1043     beta_binom_update(beta_binom_null, 0.5f, ad_rho, n1, n2);
1044     beta_binom_update(beta_binom_alt, 0.5f + (float)bdev_lrr_baf, ad_rho, n1, n2);
1045     float ret = 0.0f;
1046     for (int i = 0; i < n; i++) {
1047         float lrr = imap ? lrr_arr[imap[i]] : lrr_arr[i];
1048         int16_t ad0 = imap ? ad0_arr[imap[i]] : ad0_arr[i];
1049         int16_t ad1 = imap ? ad1_arr[imap[i]] : ad1_arr[i];
1050         float log_lkl = lrr_ad_log_lkl(lrr, ad0, ad1, ldev, lrr_sd, lrr_bias, beta_binom_alt)
1051                         - lrr_ad_log_lkl(lrr, ad0, ad1, 0.0f, lrr_sd, lrr_bias, beta_binom_null);
1052         if (!isnan(err_log_prb)) {
1053             if (log_lkl < err_log_prb)
1054                 log_lkl = err_log_prb;
1055             else if (log_lkl > -err_log_prb)
1056                 log_lkl = -err_log_prb;
1057         }
1058         ret += log_lkl;
1059     }
1060     return (double)ret * M_LOG10E;
1061 }
1062 
1063 // return the LOD likelihood for a segment
ad_phase_lod(const int16_t * ad0_arr,const int16_t * ad1_arr,const int8_t * gt_phase,int n,const int * imap,const int8_t * as,float err_log_prb,float ad_rho,double bdev)1064 static double ad_phase_lod(const int16_t *ad0_arr, const int16_t *ad1_arr, const int8_t *gt_phase, int n,
1065                            const int *imap, const int8_t *as, float err_log_prb, float ad_rho, double bdev) {
1066     if (n == 0 || bdev < 0.0 || bdev > 0.5) return -INFINITY; // kmin_brent does not handle NAN
1067 
1068     int n1, n2;
1069     get_max_sum(ad0_arr, ad1_arr, n, imap, &n1, &n2);
1070     beta_binom_update(beta_binom_null, 0.5f, ad_rho, n1, n2);
1071     beta_binom_update(beta_binom_alt, 0.5f + (float)bdev, ad_rho, n1, n2);
1072     float ret = 0.0f;
1073     for (int i = 0; i < n; i++) {
1074         int16_t ad0 = imap ? ad0_arr[imap[i]] : ad0_arr[i];
1075         int16_t ad1 = imap ? ad1_arr[imap[i]] : ad1_arr[i];
1076         int8_t p = imap ? gt_phase[imap[i]] : gt_phase[i];
1077         if (as) p *= (int8_t)SIGN(as[i]); // notice as has no imap
1078         float log_lkl = ad_phase_log_lkl(ad0, ad1, p, beta_binom_alt) - ad_phase_log_lkl(ad0, ad1, 0, beta_binom_null);
1079         if (!isnan(err_log_prb)) {
1080             if (log_lkl < err_log_prb)
1081                 log_lkl = err_log_prb;
1082             else if (log_lkl > -err_log_prb)
1083                 log_lkl = -err_log_prb;
1084         }
1085         ret += log_lkl;
1086     }
1087     return (double)ret * M_LOG10E;
1088 }
1089 
1090 typedef struct
1091 {
1092     const int16_t *ad0;
1093     const int16_t *ad1;
1094     const int8_t *gt_phase;
1095     int n;
1096     const int *imap;
1097     int8_t *path;
1098     float err_log_prb;
1099     float ad_rho;
1100 }   f4_data_t;
1101 
f4(double x,void * data)1102 double f4(double x, void *data)
1103 {
1104     f4_data_t *d = data;
1105 
1106     return -ad_phase_lod(d->ad0, d->ad1, d->gt_phase, d->n, d->imap, d->path,
1107         d->err_log_prb, d->ad_rho, x);
1108 }
1109 
f4_pack(const int16_t * ad0,const int16_t * ad1,const int8_t * gt_phase,int n,const int * imap,int8_t * path,float err_log_prb,float ad_rho)1110 static inline f4_data_t *f4_pack(const int16_t *ad0, const int16_t *ad1,
1111     const int8_t *gt_phase, int n, const int *imap, int8_t *path,
1112     float err_log_prb, float ad_rho)
1113 {
1114     f4_data_t *d = malloc(sizeof(f4_data_t));
1115 
1116     d->ad0 = ad0;
1117     d->ad1 = ad1;
1118     d->gt_phase = gt_phase;
1119     d->n = n;
1120     d->imap = imap;
1121     d->path = path;
1122     d->err_log_prb = err_log_prb;
1123     d->ad_rho = ad_rho;
1124 
1125     return d;
1126 }
1127 
1128 // TODO find a better title for this function
compare_wgs_models(const int16_t * ad0,const int16_t * ad1,const int8_t * gt_phase,int n,const int * imap,float xy_log_prb,float err_log_prb,float flip_log_prb,float tel_log_prb,float ad_rho,const float * bdev,int m)1129 static float compare_wgs_models(const int16_t *ad0, const int16_t *ad1, const int8_t *gt_phase, int n, const int *imap,
1130                                 float xy_log_prb, float err_log_prb, float flip_log_prb, float tel_log_prb,
1131                                 float ad_rho, const float *bdev, int m) {
1132     if (n == 0) return NAN;
1133     float *emis_log_lkl = ad_phase_emis_log_lkl(ad0, ad1, gt_phase, n, imap, err_log_prb, ad_rho, bdev, m);
1134     int8_t *path = log_viterbi_run(emis_log_lkl, n, m, xy_log_prb, xy_log_prb, tel_log_prb, flip_log_prb, 0,
1135                                    0); // TODO can I not pass these values instead of 0 0?
1136     free(emis_log_lkl);
1137     int n_flips = 0;
1138     for (int i = 1; i < n; i++)
1139         if (path[i - 1] && path[i] && path[i - 1] != path[i]) n_flips++;
1140     f4_data_t *f4_data = f4_pack(ad0, ad1, gt_phase, n, imap, path, err_log_prb, ad_rho);
1141     double x, fx = kmin_brent(f4, 0.1, 0.2, f4_data, KMIN_EPS, &x);
1142     free(f4_data);
1143     free(path);
1144     return -(float)fx + (float)n_flips * flip_log_prb * (float)M_LOG10E;
1145 }
1146 
1147 // TODO change this or integrate with ad_lod
lod_lkl_beta_binomial(const int16_t * ad0_arr,const int16_t * ad1_arr,int n,const int * imap,double ad_rho)1148 static double lod_lkl_beta_binomial(const int16_t *ad0_arr, const int16_t *ad1_arr, int n, const int *imap,
1149                                     double ad_rho) {
1150     if (n == 0 || ad_rho <= 0.0 || ad_rho >= 1.0) return -INFINITY;
1151     float ret = 0.0f;
1152     int n1, n2;
1153     get_max_sum(ad0_arr, ad1_arr, n, imap, &n1, &n2);
1154     beta_binom_update(beta_binom_null, 0.5f, ad_rho, n1, n2);
1155     for (int i = 0; i < n; i++) {
1156         int16_t ad0 = imap ? ad0_arr[imap[i]] : ad0_arr[i];
1157         int16_t ad1 = imap ? ad1_arr[imap[i]] : ad1_arr[i];
1158         ret += beta_binom_log_lkl(beta_binom_null, ad0, ad1);
1159     }
1160     return (double)ret * M_LOG10E;
1161 }
1162 
1163 /*********************************
1164  * BASIC STATISTICS FUNCTIONS    *
1165  *********************************/
1166 
1167 // iterator of non-NaN values
next_not_missing(const float * v,const int * imap,int n,int * i)1168 static inline float next_not_missing(const float *v, const int *imap, int n, int *i) {
1169     float x = NAN;
1170     while (*i < n) {
1171         x = imap ? v[imap[*i]] : v[*i];
1172         if (!isnan(x)) break;
1173         (*i)++;
1174     }
1175     return x;
1176 }
1177 
1178 // compute BAF phase concordance for a float array with iterator
get_baf_conc(const float * baf,const int8_t * gt_phase,int n,int * conc,int * disc)1179 static void get_baf_conc(const float *baf, const int8_t *gt_phase, int n, int *conc, int *disc) {
1180     int i;
1181     float prev = NAN, next = NAN;
1182     *conc = 0, *disc = 0;
1183     for (i = 0; i < n; i++) {
1184         prev = (baf[i] - 0.5f) * (float)gt_phase[i];
1185         if (!isnan(prev) && prev != 0.0f) break;
1186     }
1187     if (i == n) return;
1188 
1189     for (i++; i < n; i++) {
1190         next = (baf[i] - 0.5f) * (float)gt_phase[i];
1191         if (!isnan(next) && next != 0.0f) {
1192             if (prev * next > 0.0f)
1193                 (*conc)++;
1194             else if (prev * next < 0.0f)
1195                 (*disc)++;
1196             prev = next;
1197         }
1198     }
1199 }
1200 
1201 // compute phased BAF autocorrelation for a float array with iterator
get_baf_auto_corr(const float * baf,const int8_t * gt_phase,int n)1202 static float get_baf_auto_corr(const float *baf, const int8_t *gt_phase, int n) {
1203     double var = 0.0, auto_corr = 0.0;
1204     float prev = NAN, next = NAN;
1205     for (int i = 0; i < n; i++) {
1206         next = (baf[i] - 0.5f) * (float)gt_phase[i];
1207         if (!isnan(next)) {
1208             var += sq((double)next);
1209             if (!isnan(prev)) auto_corr += prev * next;
1210             prev = next;
1211         }
1212     }
1213     auto_corr /= var;
1214     return auto_corr;
1215 }
1216 
1217 // compute sample standard deviation of a float array (with iterator)
1218 // sqrt ( ( \sum x^2 - (\sum x)^2 / N ) / ( N - 1 ) )
get_sample_sd(const float * v,int n,const int * imap)1219 static float get_sample_sd(const float *v, int n, const int *imap) {
1220     float mean = get_sample_mean(v, n, imap);
1221     float s2 = 0.0;
1222     int j = 0;
1223     for (int i = 0; i < n; i++) {
1224         float tmp = imap ? v[imap[i]] : v[i];
1225         if (!isnan(tmp)) {
1226             s2 += sq(tmp - mean);
1227             j++;
1228         }
1229     }
1230     s2 /= (double)(j - 1);
1231     return (float)sqrt(s2);
1232 }
1233 
1234 // compute standard error of mean a float array (with iterator)
1235 // sqrt ( ( \sum x^2 - (\sum x)^2 / N ) / ( N - 1 ) / N )
get_se_mean(const float * v,int n,const int * imap)1236 static float get_se_mean(const float *v, int n, const int *imap) {
1237     int j = 0;
1238     for (int i = 0; i < n; i++) {
1239         float tmp = imap ? v[imap[i]] : v[i];
1240         if (!isnan(tmp)) j++;
1241     }
1242     if (j <= 1) return NAN;
1243 
1244     return get_sample_sd(v, n, imap) / sqrtf(j);
1245 }
1246 
1247 // compute (adjusted) LRR autocorrelation for a float array with iterator
get_lrr_auto_corr(const float * lrr,int n,const int * imap)1248 static float get_lrr_auto_corr(const float *lrr, int n, const int *imap) {
1249     float value;
1250     double mean = 0.0;
1251     int i = 0, j = 0;
1252     for (value = next_not_missing(lrr, imap, n, &i); i < n; i++, value = next_not_missing(lrr, imap, n, &i)) {
1253         mean += (double)value;
1254         j++;
1255     }
1256     if (j <= 1) return NAN;
1257     mean /= (double)j;
1258 
1259     double var = 0.0;
1260     i = 0;
1261     for (value = next_not_missing(lrr, imap, n, &i); i < n; i++, value = next_not_missing(lrr, imap, n, &i)) {
1262         var += sq((double)value - mean);
1263     }
1264 
1265     double auto_corr = 0.0;
1266     i = 0;
1267     double prev = (double)next_not_missing(lrr, imap, n, &i) - mean, next;
1268     for (i++, value = next_not_missing(lrr, imap, n, &i); i < n; i++, value = next_not_missing(lrr, imap, n, &i)) {
1269         next = (double)value - mean;
1270         auto_corr += prev * next;
1271         prev = next;
1272     }
1273     auto_corr /= var;
1274     return auto_corr;
1275 }
1276 
1277 // compute the n50 of the call when split at the heterozygous sites
get_n50_hets(const int * v,const float * baf,int n,int beg_pos,int end_pos)1278 static int get_n50_hets(const int *v, const float *baf, int n, int beg_pos, int end_pos) {
1279     int *w = (int *)malloc((n + 1) * sizeof(int));
1280     int i, j, sum, prev = beg_pos;
1281     for (i = 0, j = 0; i < n; i++) {
1282         if (!isnan(baf[i])) {
1283             w[j++] = v[i] - prev;
1284             prev = v[i];
1285         }
1286     }
1287     w[j++] = end_pos - prev;
1288 
1289     ks_introsort_int((size_t)j, w);
1290 
1291     for (i = 0, sum = 0; sum << 1 < end_pos - beg_pos && i < j; i++) sum += w[i];
1292     int n50 = w[i - 1];
1293     free(w);
1294     return n50;
1295 }
1296 
1297 /*********************************
1298  * SAMPLE METHODS                *
1299  *********************************/
1300 
mocha_print_ucsc(FILE * restrict stream,const mocha_t * mocha,int n,const bcf_hdr_t * hdr)1301 static void mocha_print_ucsc(FILE *restrict stream, const mocha_t *mocha, int n, const bcf_hdr_t *hdr) {
1302     if (stream == NULL) return;
1303     const char *name[4];
1304     name[MOCHA_UNDET] = "mCA_undetermined";
1305     name[MOCHA_LOSS] = "mCA_loss";
1306     name[MOCHA_GAIN] = "mCA_gain";
1307     name[MOCHA_CNLOH] = "mCA_neutral";
1308     const char *desc[4];
1309     desc[MOCHA_UNDET] = "Undetermined";
1310     desc[MOCHA_LOSS] = "Losses";
1311     desc[MOCHA_GAIN] = "Gains";
1312     desc[MOCHA_CNLOH] = "CN-LOHs";
1313     kstring_t tmp = {0, 0, NULL};
1314     for (int i = 0; i < 4; i++) {
1315         fprintf(stream, "track name=%s description=\"%s\" visibility=4 priority=1 itemRgb=\"On\"\n", name[i], desc[i]);
1316         for (int j = 0; j < n; j++) {
1317             uint8_t red = 127, green = 127, blue = 127;
1318             switch (i) {
1319             case MOCHA_LOSS:
1320                 red -= (uint8_t)(127.0f * sqf(mocha[j].cf));
1321                 green -= (uint8_t)(127.0f * sqf(mocha[j].cf));
1322                 blue += (uint8_t)(128.0f * sqf(mocha[j].cf));
1323                 break;
1324             case MOCHA_GAIN:
1325                 red += (uint8_t)(128.0f * sqf(mocha[j].cf));
1326                 green -= (uint8_t)(127.0f * sqf(mocha[j].cf));
1327                 blue -= (uint8_t)(127.0f * sqf(mocha[j].cf));
1328                 break;
1329             case MOCHA_CNLOH:
1330                 red += (uint8_t)(128.0f * mocha[j].cf);
1331                 green += (uint8_t)(38.0f * mocha[j].cf);
1332                 blue -= (uint8_t)(127.0f * mocha[j].cf);
1333                 break;
1334             }
1335             if (i == mocha[j].type) {
1336                 const char *sample_name = bcf_hdr_int2id(hdr, BCF_DT_SAMPLE, mocha[j].sample_idx);
1337                 tmp.l = 0;
1338                 kputs(sample_name, &tmp);
1339                 for (char *p = tmp.s; (p = strchr(p, ' ')); ++p) *p = '_';
1340                 const char *seq_name = bcf_hdr_id2name(hdr, mocha[j].rid);
1341                 if (strncmp(seq_name, "chr", 3) == 0)
1342                     fprintf(stream, "%s\t%d\t%d\t%s\t%d\t.\t%d\t%d\t%d,%d,%d\n", seq_name, mocha[j].beg_pos,
1343                             mocha[j].end_pos, tmp.s, isnan(mocha[j].cf) ? 0 : (int)(1e3 * mocha[j].cf),
1344                             mocha[j].beg_pos, mocha[j].end_pos, red, green, blue);
1345                 else
1346                     fprintf(stream, "chr%s\t%d\t%d\t%s\t%d\t.\t%d\t%d\t%d,%d,%d\n", seq_name, mocha[j].beg_pos,
1347                             mocha[j].end_pos, tmp.s, isnan(mocha[j].cf) ? 0 : (int)(1e3 * mocha[j].cf),
1348                             mocha[j].beg_pos, mocha[j].end_pos, red, green, blue);
1349             }
1350         }
1351     }
1352     free(tmp.s);
1353     if (stream != stdout && stream != stderr) fclose(stream);
1354 }
1355 
mocha_print_calls(FILE * restrict stream,const mocha_t * mocha,int n,const bcf_hdr_t * hdr,int flags,char * genome,float lrr_hap2dip)1356 static void mocha_print_calls(FILE *restrict stream, const mocha_t *mocha, int n, const bcf_hdr_t *hdr, int flags,
1357                               char *genome, float lrr_hap2dip) {
1358     if (stream == NULL) return;
1359     char gender[3];
1360     gender[GENDER_UNKNOWN] = 'U';
1361     gender[GENDER_MALE] = 'M';
1362     gender[GENDER_FEMALE] = 'F';
1363     const char *type[6];
1364     type[MOCHA_UNDET] = "Undetermined";
1365     type[MOCHA_LOSS] = "Loss";
1366     type[MOCHA_GAIN] = "Gain";
1367     type[MOCHA_CNLOH] = "CN-LOH";
1368     type[MOCHA_CNP_LOSS] = "CNP_Loss";
1369     type[MOCHA_CNP_GAIN] = "CNP_Gain";
1370     char arm_type[3];
1371     arm_type[MOCHA_NOT] = 'N';
1372     arm_type[MOCHA_ARM] = 'Y';
1373     arm_type[MOCHA_TEL] = 'T';
1374     fputs("sample_id", stream);
1375     fputs("\tcomputed_gender", stream);
1376     fputs("\tchrom", stream);
1377     fprintf(stream, "\tbeg_%s", genome);
1378     fprintf(stream, "\tend_%s", genome);
1379     fputs("\tlength", stream);
1380     fputs("\tp_arm", stream);
1381     fputs("\tq_arm", stream);
1382     fputs("\tn_sites", stream);
1383     fputs("\tn_hets", stream);
1384     fputs("\tn50_hets", stream);
1385     fputs("\tbdev", stream);
1386     fputs("\tbdev_se", stream);
1387     fputs("\trel_cov", stream);
1388     fputs("\trel_cov_se", stream);
1389     fputs("\tlod_lrr_baf", stream);
1390     fputs("\tlod_baf_phase", stream);
1391     fputs("\tn_flips", stream);
1392     fputs("\tbaf_conc", stream);
1393     fputs("\tlod_baf_conc", stream);
1394     fputs("\ttype", stream);
1395     fputs("\tcf", stream);
1396     fputc('\n', stream);
1397     for (int i = 0; i < n; i++) {
1398         fputs(bcf_hdr_int2id(hdr, BCF_DT_SAMPLE, mocha->sample_idx), stream);
1399         fprintf(stream, "\t%c", gender[mocha->computed_gender]);
1400         fprintf(stream, "\t%s", bcf_hdr_id2name(hdr, mocha->rid));
1401         fprintf(stream, "\t%d", mocha->beg_pos);
1402         fprintf(stream, "\t%d", mocha->end_pos);
1403         fprintf(stream, "\t%d", mocha->length);
1404         fprintf(stream, "\t%c", arm_type[mocha->p_arm]);
1405         fprintf(stream, "\t%c", arm_type[mocha->q_arm]);
1406         fprintf(stream, "\t%d", mocha->n_sites);
1407         fprintf(stream, "\t%d", mocha->n_hets);
1408         fprintf(stream, "\t%d", mocha->n50_hets);
1409         fprintf(stream, "\t%.4f", mocha->bdev);
1410         fprintf(stream, "\t%.4f", mocha->bdev_se);
1411         fprintf(stream, "\t%.4f", 2.0f * expf(mocha->ldev / lrr_hap2dip * (float)M_LN2));
1412         fprintf(stream, "\t%.4f",
1413                 2.0f * expf(mocha->ldev / lrr_hap2dip * (float)M_LN2) * mocha->ldev_se / lrr_hap2dip * (float)M_LN2);
1414         fprintf(stream, "\t%.2f", mocha->lod_lrr_baf);
1415         fprintf(stream, "\t%.2f", mocha->lod_baf_phase);
1416         fprintf(stream, "\t%d", mocha->n_flips);
1417         fprintf(stream, "\t%.4f", mocha->baf_conc);
1418         fprintf(stream, "\t%.2f", mocha->lod_baf_conc);
1419         fprintf(stream, "\t%s", type[mocha->type]);
1420         fprintf(stream, "\t%.4f", mocha->cf);
1421         fputc('\n', stream);
1422         mocha++;
1423     }
1424     if (stream != stdout && stream != stderr) fclose(stream);
1425 }
1426 
1427 // this function returns two values (a, b) such that:
1428 // (i) a <= b (ii) pos[a-1] < beg (iii) beg <= pos[a] < end (iv) beg <= pos[b] < end (v)
1429 // pos[b+1] >= end
get_cnp_edges(const int * pos,int n,int beg,int end,int * a,int * b)1430 static int get_cnp_edges(const int *pos, int n, int beg, int end, int *a, int *b) {
1431     if (pos[0] >= end || pos[n - 1] < beg) return -1;
1432 
1433     int i = 0, j = n - 1, k;
1434     while (j - i > 1) {
1435         k = (i + j) / 2;
1436         if (pos[k] < beg)
1437             i = k;
1438         else
1439             j = k;
1440     }
1441     if (pos[j] >= end) return -1;
1442 
1443     *a = j;
1444 
1445     i = j;
1446     j = n - 1;
1447     while (j - i > 1) {
1448         k = (i + j) / 2;
1449         if (pos[k] < end)
1450             i = k;
1451         else
1452             j = k;
1453     }
1454 
1455     *b = i;
1456     return 0;
1457 }
1458 
1459 // classify mosaic chromosomal alteration type based on LRR and BAF
1460 // LDEV = -log2( 1 - 2 x BDEV ) * LRR-hap2dip for gains
1461 // LDEV = -log2( 1 + 2 x BDEV ) * LRR-hap2dip for losses
mocha_type(float ldev,float ldev_se,float bdev,float bdev_se,int n_hets,float lrr_hap2dip,int8_t p_arm,int8_t q_arm)1462 static int8_t mocha_type(float ldev, float ldev_se, float bdev, float bdev_se, int n_hets, float lrr_hap2dip,
1463                          int8_t p_arm, int8_t q_arm) {
1464     // a LOD score can be computed from a chi-squared statistic by dividing by 2ln(10) ~ 4.6
1465     float z2_cnloh = sqf(ldev / ldev_se);
1466     // equivalent of a 2 LOD score bonus for ending in one but not two telomeres
1467     if ((p_arm != MOCHA_TEL && q_arm != MOCHA_TEL) || (p_arm == MOCHA_TEL && q_arm == MOCHA_TEL))
1468         z2_cnloh += 4.0f * M_LN10;
1469     else
1470         z2_cnloh -= 4.0f * M_LN10;
1471 
1472     // if one model has 4 LOD scores point more than the other model, select the better model
1473     if (ldev > 0) {
1474         if (n_hets < 5 || isnan(bdev)) {
1475             if (!isnan(bdev_se) && z2_cnloh < 8.0 * M_LN10)
1476                 return MOCHA_CNLOH;
1477             else
1478                 return MOCHA_GAIN;
1479         } else {
1480             float expected_ldev =
1481                 -logf(1.0f - 2.0f * bdev > 2.0f / 3.0f ? 1.0f - 2.0f * bdev : 2.0f / 3.0f) * M_LOG2E * lrr_hap2dip;
1482             float z2_gain = sqf((ldev - expected_ldev) / ldev_se);
1483             if (z2_cnloh > z2_gain + 8.0f * M_LN10) return MOCHA_GAIN;
1484             if (z2_gain > z2_cnloh + 8.0f * M_LN10) return MOCHA_CNLOH;
1485         }
1486     } else {
1487         if (n_hets < 5 || isnan(bdev)) {
1488             if (!isnan(bdev_se) && z2_cnloh < 8.0 * M_LN10)
1489                 return MOCHA_CNLOH;
1490             else
1491                 return MOCHA_LOSS;
1492         } else {
1493             float expected_ldev = -logf(1.0f + 2.0f * bdev) * M_LOG2E * lrr_hap2dip;
1494             float z2_loss = sqf((ldev - expected_ldev) / ldev_se);
1495             if (z2_cnloh > z2_loss + 8.0f * M_LN10) return MOCHA_LOSS;
1496             if (z2_loss > z2_cnloh + 8.0f * M_LN10) return MOCHA_CNLOH;
1497         }
1498     }
1499     return MOCHA_UNDET;
1500 }
1501 
1502 // best estimate for cell fraction using the following formula (if BDEV is available):
1503 // BDEV = | 1 / 2 - 1 / CNF |
1504 // CNF = 2 / ( 1 + 2 x BDEV ) for losses
1505 // CNF = 2 / ( 1 - 2 x BDEV ) for gains
1506 // CNF = 2 x 2^( LDEV / LRR-hap2dip )
1507 // LDEV = - LRR-hap2dip / ln(2) * ln( 1 + 2 x BDEV ) for losses
1508 // LDEV = - LRR-hap2dip / ln(2) * ln( 1 - 2 x BDEV ) for gains
mocha_cell_fraction(float ldev,float ldev_se,float bdev,int n_hets,int8_t type,float lrr_hap2dip)1509 static float mocha_cell_fraction(float ldev, float ldev_se, float bdev, int n_hets, int8_t type, float lrr_hap2dip) {
1510     if (n_hets < 5 || isnan(bdev)) {
1511         switch (type) {
1512         case MOCHA_LOSS:
1513             return ldev < -lrr_hap2dip ? 1.0f : -2.0f * (expf(ldev / lrr_hap2dip * (float)M_LN2) - 1.0f);
1514         case MOCHA_GAIN:
1515             return ldev * (float)M_LN2 > lrr_hap2dip * logf(1.5f)
1516                        ? 1.0f
1517                        : 2.0f * (expf(ldev / lrr_hap2dip * (float)M_LN2) - 1.0f);
1518         default:
1519             return NAN;
1520         }
1521     } else {
1522         switch (type) {
1523         case MOCHA_LOSS:
1524             return 4.0f * bdev / (1.0f + 2.0f * bdev);
1525         case MOCHA_GAIN:
1526             return bdev > 1.0f / 6.0f ? 1.0f : 4.0f * bdev / (1.0f - 2.0f * bdev);
1527         case MOCHA_CNLOH:
1528             return 2.0f * bdev;
1529         case MOCHA_UNDET:
1530             return bdev < 0.05f ? 4.0f * bdev : NAN; // here it assumes it is either a loss or a gain
1531         default:
1532             return NAN;
1533         }
1534     }
1535 }
1536 
get_mocha_stats(const int * pos,const float * lrr,const float * baf,const int8_t * gt_phase,int n,int a,int b,int cen_beg,int cen_end,int length,float baf_conc,mocha_t * mocha)1537 static void get_mocha_stats(const int *pos, const float *lrr, const float *baf, const int8_t *gt_phase, int n, int a,
1538                             int b, int cen_beg, int cen_end, int length, float baf_conc, mocha_t *mocha) {
1539     mocha->n_sites = b + 1 - a;
1540 
1541     if (a == 0)
1542         if (pos[a] < cen_beg)
1543             mocha->beg_pos = 0;
1544         else
1545             mocha->beg_pos = cen_end;
1546     else
1547         mocha->beg_pos = pos[a];
1548 
1549     if (b == n - 1)
1550         if (pos[b] >= cen_end)
1551             mocha->end_pos = length;
1552         else
1553             mocha->end_pos = cen_beg;
1554     else
1555         mocha->end_pos = pos[b];
1556 
1557     mocha->length = mocha->end_pos - mocha->beg_pos;
1558 
1559     if (mocha->beg_pos == 0)
1560         mocha->p_arm = MOCHA_TEL;
1561     else if (mocha->beg_pos < cen_beg)
1562         mocha->p_arm = MOCHA_ARM;
1563     else
1564         mocha->p_arm = MOCHA_NOT;
1565 
1566     if (mocha->end_pos == length)
1567         mocha->q_arm = MOCHA_TEL;
1568     else if (mocha->end_pos > cen_end)
1569         mocha->q_arm = MOCHA_ARM;
1570     else
1571         mocha->q_arm = MOCHA_NOT;
1572 
1573     mocha->ldev_se = get_se_mean(lrr + a, b + 1 - a, NULL);
1574     mocha->n_hets = 0;
1575     for (int i = a; i <= b; i++)
1576         if (!isnan(baf[i])) mocha->n_hets++;
1577     int conc, disc;
1578     get_baf_conc(baf + a, gt_phase + a, b + 1 - a, &conc, &disc);
1579     mocha->baf_conc = conc + disc > 0 ? (float)conc / (float)(conc + disc) : NAN;
1580     mocha->lod_baf_conc = ((mocha->baf_conc > 0 ? (float)conc * logf(mocha->baf_conc / baf_conc) : 0)
1581                            + (mocha->baf_conc < 1 ? (float)disc * logf((1 - mocha->baf_conc) / (1 - baf_conc)) : 0))
1582                           * (float)M_LOG10E;
1583     mocha->n50_hets = get_n50_hets(pos + a, baf + a, b + 1 - a, mocha->beg_pos, mocha->end_pos);
1584     mocha->n_flips = -1;
1585     mocha->bdev = NAN;
1586     mocha->bdev_se = NAN;
1587     mocha->lod_baf_phase = NAN;
1588 }
1589 
1590 // return segments called by the HMM or a suggestion of what state should be added to the HMM
get_path_segs(const int8_t * path,const float * hs_arr,int n,int hmm_model,int middle,int ** beg,int * m_beg,int ** end,int * m_end,int * nseg)1591 static float get_path_segs(const int8_t *path, const float *hs_arr, int n, int hmm_model, int middle, int **beg,
1592                            int *m_beg, int **end, int *m_end, int *nseg) {
1593     int a = 0, b = 0;
1594     *nseg = 0;
1595     for (b = 0; b < n; b++) {
1596         // check whether it is the end of a segment
1597         if (b != n - 1) {
1598             int x = abs(path[b]);
1599             int y = abs(path[b + 1]);
1600             if (x == y) continue;
1601 
1602             // swap the two elements
1603             if (x > y) {
1604                 x = abs(path[b + 1]);
1605                 y = abs(path[b]);
1606             }
1607 
1608             if (y - x == 1) {
1609                 if (hmm_model == LRR_BAF && x > 0 && x != middle)
1610                     return (hs_arr[x - 1] + hs_arr[y - 1]) * 0.5f;
1611                 else if (hmm_model == BAF_PHASE)
1612                     return x == 0 ? hs_arr[0] * 0.5f : (hs_arr[x - 1] + hs_arr[y - 1]) * 0.5f;
1613             }
1614         }
1615 
1616         if (path[b]) {
1617             (*nseg)++;
1618             hts_expand(int, *nseg, *m_beg, *beg);
1619             (*beg)[(*nseg) - 1] = a;
1620             hts_expand(int, *nseg, *m_end, *end);
1621             (*end)[(*nseg) - 1] = b;
1622         }
1623         a = b + 1;
1624     }
1625     return 0;
1626 }
1627 
1628 typedef struct
1629 {
1630     float *lrr;
1631     int a;
1632     int16_t *ad0;
1633     int16_t *ad1;
1634     mocha_t *mocha;
1635     const model_t *model;
1636     sample_t *self;
1637     float *baf;
1638 }   f3_data_t;
1639 
f3(double x,void * data)1640 double f3(double x, void *data)
1641 {
1642     f3_data_t *d = data;
1643 
1644     if (d->model->flags & WGS_DATA)
1645         return -lrr_ad_lod(d->lrr + d->a, d->ad0 + d->a, d->ad1 + d->a,
1646             d->mocha->n_sites, NULL, NAN, d->model->lrr_bias,
1647             d->model->lrr_hap2dip, d->self->adjlrr_sd,
1648             d->self->stats.dispersion, x);
1649     else
1650         return -lrr_baf_lod(d->lrr + d->a, d->baf + d->a, d->mocha->n_sites,
1651             NULL, NAN, d->model->lrr_bias, d->model->lrr_hap2dip,
1652             d->self->adjlrr_sd, d->self->stats.dispersion, x);
1653 }
1654 
f3_pack(float * lrr,int a,int16_t * ad0,int16_t * ad1,mocha_t * mocha,const model_t * model,sample_t * self,float * baf)1655 static inline f3_data_t *f3_pack(float *lrr, int a, int16_t *ad0, int16_t *ad1,
1656     mocha_t *mocha, const model_t *model, sample_t *self, float *baf)
1657 
1658 {
1659     f3_data_t *d = malloc(sizeof(f3_data_t));
1660 
1661     d->lrr = lrr;
1662     d->a = a;
1663     d->ad0 = ad0;
1664     d->ad1 = ad1;
1665     d->mocha = mocha;
1666     d->model = model;
1667     d->self = self;
1668     d->baf = baf;
1669 
1670     return d;
1671 }
1672 
1673 typedef struct
1674 {
1675     int16_t *ad0;
1676     int16_t *ad1;
1677     int8_t *gt_phase;
1678     mocha_t *mocha;
1679     int *imap_arr;
1680     int *beg;
1681     int i;
1682     int8_t *path;
1683     sample_t *self;
1684 }   f6_data_t;
1685 
f6(double x,void * data)1686 double f6(double x, void *data)
1687 {
1688     f6_data_t *d = data;
1689 
1690     return -ad_phase_lod(d->ad0, d->ad1, d->gt_phase, d->mocha->n_hets,
1691         d->imap_arr + d->beg[d->i], d->path + d->beg[d->i], NAN, d->self->stats.dispersion, x);
1692 }
1693 
f6_pack(int16_t * ad0,int16_t * ad1,int8_t * gt_phase,mocha_t * mocha,int * imap_arr,int * beg,int i,int8_t * path,sample_t * self)1694 static inline f6_data_t *f6_pack(int16_t *ad0, int16_t *ad1, int8_t *gt_phase,
1695     mocha_t *mocha, int *imap_arr, int *beg, int i, int8_t *path, sample_t *self)
1696 
1697 {
1698     f6_data_t *d = malloc(sizeof(f6_data_t));
1699 
1700     d->ad0 = ad0;
1701     d->ad1 = ad1;
1702     d->gt_phase = gt_phase;
1703     d->mocha = mocha;
1704     d->imap_arr = imap_arr;
1705     d->beg = beg;
1706     d->i = i;
1707     d->path = path;
1708     d->self = self;
1709 
1710     return d;
1711 }
1712 
1713 typedef struct
1714 {
1715     float *baf;
1716     int8_t *gt_phase;
1717     mocha_t *mocha;
1718     int *imap_arr;
1719     int *beg;
1720     int i;
1721     int8_t *path;
1722     sample_t *self;
1723 }   f8_data_t;
1724 
f8(double x,void * data)1725 double f8(double x, void *data)
1726 {
1727     f8_data_t *d = data;
1728 
1729     return -baf_phase_lod(d->baf, d->gt_phase, d->mocha->n_hets,
1730 	d->imap_arr + d->beg[d->i], d->path + d->beg[d->i], NAN,
1731 	d->self->stats.dispersion, x);
1732 }
1733 
f8_pack(float * baf,int8_t * gt_phase,mocha_t * mocha,int * imap_arr,int * beg,int i,int8_t * path,sample_t * self)1734 static inline f8_data_t *f8_pack(float *baf, int8_t *gt_phase, mocha_t *mocha,
1735     int *imap_arr, int *beg, int i, int8_t *path, sample_t *self)
1736 {
1737     f8_data_t *d = malloc(sizeof(f8_data_t));
1738 
1739     d->baf = baf;
1740     d->gt_phase = gt_phase;
1741     d->mocha = mocha;
1742     d->imap_arr = imap_arr;
1743     d->beg = beg;
1744     d->i = i;
1745     d->path = path;
1746     d->self = self;
1747 
1748     return d;
1749 }
1750 
1751 // process one contig for one sample
sample_run(sample_t * self,mocha_table_t * mocha_table,const model_t * model)1752 static void sample_run(sample_t *self, mocha_table_t *mocha_table, const model_t *model) {
1753     // do nothing if chromosome Y or MT are being tested
1754     if (model->rid == model->genome_rules->y_rid || model->rid == model->genome_rules->mt_rid) {
1755         memset(self->data_arr[LDEV], 0, self->n * sizeof(int16_t));
1756         memset(self->data_arr[BDEV], 0, self->n * sizeof(int16_t));
1757         memset(self->phase_arr, 0, self->n * sizeof(int8_t));
1758         return;
1759     }
1760 
1761     mocha_t mocha;
1762     mocha.sample_idx = self->idx;
1763     mocha.computed_gender = self->computed_gender;
1764     mocha.rid = model->rid;
1765 
1766     int cen_beg = model->genome_rules->cen_beg[model->rid];
1767     int cen_end = model->genome_rules->cen_end[model->rid];
1768     int length = model->genome_rules->length[model->rid];
1769     if (length == 0) length = model->locus_arr[model->n - 1].pos;
1770     // incentive to extend to the telomere
1771     float tel_log_prb = model->rid == model->genome_rules->x_rid
1772                             ? (self->computed_gender == GENDER_MALE ? model->chrY_tel_log_prb : model->chrX_tel_log_prb)
1773                             : model->auto_tel_log_prb;
1774 
1775     // declutter code by copying these values onto the stack
1776     int n = self->n;
1777     int8_t *gt_phase = self->phase_arr;
1778     int16_t *ad0 = self->data_arr[AD0];
1779     int16_t *ad1 = self->data_arr[AD1];
1780     float *lrr = (float *)malloc(n * sizeof(float));
1781     float *baf = (float *)malloc(n * sizeof(float));
1782     if (model->flags & WGS_DATA) {
1783         ad_to_lrr_baf(ad0, ad1, lrr, baf, n);
1784     } else {
1785         for (int i = 0; i < n; i++) {
1786             lrr[i] = int16_to_float(self->data_arr[LRR][i]);
1787             baf[i] = int16_to_float(self->data_arr[BAF][i]);
1788         }
1789     }
1790 
1791     if (model->lrr_gc_order > 0 && n > model->lrr_gc_order)
1792         adjust_lrr(lrr, model->gc_arr, n, self->vcf_imap_arr, self->stats.coeffs, model->lrr_gc_order);
1793     else if (model->lrr_gc_order != -1)
1794         adjust_lrr(lrr, model->gc_arr, n, self->vcf_imap_arr, self->stats.coeffs, 0);
1795 
1796     int8_t *as = (int8_t *)calloc(n, sizeof(int8_t));
1797     int *pos = (int *)malloc(n * sizeof(int));
1798     for (int i = 0; i < n; i++) pos[i] = model->locus_arr[self->vcf_imap_arr[i]].pos;
1799     int *imap_arr = (int *)malloc(n * sizeof(int));
1800     int *hets_imap_arr = (int *)malloc(n * sizeof(int));
1801     float *pbaf_arr = (float *)malloc(n * sizeof(float));
1802 
1803     int16_t *ldev = (int16_t *)calloc(n, sizeof(int16_t));
1804     int16_t *bdev = (int16_t *)calloc(n, sizeof(int16_t));
1805 
1806     if (model->rid == model->genome_rules->x_rid && self->computed_gender == GENDER_MALE) {
1807         for (int i = 0; i < n; i++) {
1808             if (pos[i] > model->genome_rules->x_nonpar_beg && pos[i] < model->genome_rules->x_nonpar_end) {
1809                 lrr[i] = NAN;
1810                 baf[i] = NAN;
1811             }
1812         }
1813     }
1814 
1815     if (model->cnp_itr) {
1816         while (regitr_overlap(model->cnp_itr)) {
1817             int a, b;
1818             if (get_cnp_edges(pos, n, model->cnp_itr->beg, model->cnp_itr->end, &a, &b) == 0) {
1819                 int cnp_type = regitr_payload(model->cnp_itr, int);
1820                 float exp_ldev = NAN;
1821                 float exp_bdev = NAN;
1822                 mocha.type = MOCHA_UNDET;
1823                 mocha.ldev = get_median(lrr + a, b + 1 - a, NULL);
1824                 if (mocha.ldev > 0 && (cnp_type == MOCHA_CNP_GAIN || cnp_type == MOCHA_CNP_CNV)) {
1825                     if (model->flags & WGS_DATA)
1826                         mocha.lod_lrr_baf =
1827                             lrr_ad_lod(lrr + a, ad0 + a, ad1 + a, b + 1 - a, NULL, model->err_log_prb, model->lrr_bias,
1828                                        model->lrr_hap2dip, self->adjlrr_sd, self->stats.dispersion, 1.0f / 6.0f);
1829                     else
1830                         mocha.lod_lrr_baf =
1831                             lrr_baf_lod(lrr + a, baf + a, b + 1 - a, NULL, model->err_log_prb, model->lrr_bias,
1832                                         model->lrr_hap2dip, self->adjlrr_sd, self->stats.dispersion, 1.0f / 6.0f);
1833                     if (mocha.lod_lrr_baf
1834                         > -(model->xy_major_log_prb + model->xy_minor_log_prb) / 2.0f * (float)M_LOG10E) {
1835                         mocha.type = MOCHA_CNP_GAIN;
1836                         mocha.cf = NAN;
1837                         exp_ldev = log2f(1.5f) * model->lrr_hap2dip;
1838                         exp_bdev = 1.0f / 6.0f;
1839                     }
1840                 } else if (mocha.ldev <= 0 && (cnp_type == MOCHA_CNP_LOSS || cnp_type == MOCHA_CNP_CNV)) {
1841                     if (model->flags & WGS_DATA)
1842                         mocha.lod_lrr_baf =
1843                             lrr_ad_lod(lrr + a, ad0 + a, ad1 + a, b + 1 - a, NULL, model->err_log_prb, model->lrr_bias,
1844                                        model->lrr_hap2dip, self->adjlrr_sd, self->stats.dispersion, -0.5f);
1845                     else
1846                         mocha.lod_lrr_baf =
1847                             lrr_baf_lod(lrr + a, baf + a, b + 1 - a, NULL, model->err_log_prb, model->lrr_bias,
1848                                         model->lrr_hap2dip, self->adjlrr_sd, self->stats.dispersion, -0.5f);
1849                     if (mocha.lod_lrr_baf
1850                         > -(model->xy_major_log_prb + model->xy_minor_log_prb) / 2.0f * (float)M_LOG10E) {
1851                         mocha.type = MOCHA_CNP_LOSS;
1852                         mocha.cf = NAN;
1853                         exp_ldev = -model->lrr_hap2dip;
1854                         exp_bdev = 0.5f;
1855                     }
1856                 }
1857                 if (mocha.type == MOCHA_CNP_GAIN || mocha.type == MOCHA_CNP_LOSS) {
1858                     if (model->flags & WGS_DATA) {
1859                         if (cnp_edge_is_not_cn2_lrr_ad(lrr, ad0, ad1, n, a, b,
1860                                                        (model->xy_major_log_prb + model->xy_minor_log_prb) / 2.0f,
1861                                                        model->err_log_prb, model->lrr_bias, model->lrr_hap2dip,
1862                                                        self->adjlrr_sd, self->stats.dispersion, exp_ldev, exp_bdev))
1863                             continue;
1864                     } else {
1865                         if (cnp_edge_is_not_cn2_lrr_baf(lrr, baf, n, a, b,
1866                                                         (model->xy_major_log_prb + model->xy_minor_log_prb) / 2.0f,
1867                                                         model->err_log_prb, model->lrr_bias, model->lrr_hap2dip,
1868                                                         self->adjlrr_sd, self->stats.dispersion, exp_ldev, exp_bdev))
1869                             continue;
1870                     }
1871                     get_mocha_stats(pos, lrr, baf, gt_phase, n, a, b, cen_beg, cen_end, length, self->stats.baf_conc,
1872                                     &mocha);
1873                     // compute bdev, if possible
1874                     if (mocha.n_hets > 0) {
1875                         mocha.bdev = model->flags & WGS_DATA
1876                                          ? get_ad_bdev(ad0 + a, ad1 + a, b + 1 - a, NULL, self->stats.dispersion)
1877                                          : get_baf_bdev(baf + a, b + 1 - a, NULL, self->stats.dispersion);
1878                     } else {
1879                         mocha.bdev = NAN;
1880                     }
1881                     mocha_table->n++;
1882                     hts_expand(mocha_t, mocha_table->n, mocha_table->m, mocha_table->a);
1883                     mocha_table->a[mocha_table->n - 1] = mocha;
1884                     for (int j = a; j <= b; j++) {
1885                         // TODO add other stuff here, like setting ldev
1886                         // and bdev
1887                         lrr[j] = NAN; // do not use the data again
1888                         baf[j] = NAN; // do not use the data again
1889                     }
1890                 }
1891             }
1892         }
1893     }
1894 
1895     float *hs_arr = NULL;
1896     int n_hs = 0, m_hs = 0;
1897     for (int hmm_model = 0; hmm_model < 2; hmm_model++) {
1898         // select data to use from the contig, depending on which HMM model is being
1899         // used
1900         int last_p = 0, first_q = 0;
1901         int n_imap = 0;
1902         for (int i = 0; i < n; i++)
1903             if ((hmm_model == LRR_BAF && !isnan(lrr[i])) || (hmm_model == BAF_PHASE && !isnan(baf[i]))) {
1904                 if (pos[i] < cen_beg) last_p++;
1905                 if (pos[i] < cen_end) first_q++;
1906                 imap_arr[n_imap] = i;
1907                 n_imap++;
1908             }
1909         if (n_imap == 0) continue;
1910 
1911         // compute emission probabilities and Viterbi path according to HMM model
1912         int middle = 0;
1913         // TODO eliminate this redundancy
1914         if (hmm_model == LRR_BAF) {
1915             n_hs = model->bdev_lrr_baf_n;
1916             hts_expand(float, n_hs, m_hs, hs_arr);
1917             for (int i = 0; i < model->bdev_lrr_baf_n; i++) {
1918                 hs_arr[i] = model->bdev_lrr_baf[i];
1919                 if (model->bdev_lrr_baf[i] < 0.0f) middle++;
1920             }
1921         } else if (hmm_model == BAF_PHASE) {
1922             n_hs = model->bdev_baf_phase_n;
1923             hts_expand(float, n_hs, m_hs, hs_arr);
1924             for (int i = 0; i < model->bdev_baf_phase_n; i++) hs_arr[i] = model->bdev_baf_phase[i];
1925         }
1926         int8_t *path;
1927         float ret;
1928         int *beg = NULL, m_beg = 0, *end = NULL, m_end = 0, nseg;
1929         do {
1930             if (n_hs + (hmm_model == LRR_BAF ? n_hs : 0) > 50) error("Too many states being tested for the HMM\n");
1931 
1932             float *emis_log_lkl;
1933             if (model->flags & WGS_DATA) {
1934                 emis_log_lkl =
1935                     hmm_model == LRR_BAF
1936                         ? lrr_ad_emis_log_lkl(lrr, ad0, ad1, n_imap, imap_arr, model->err_log_prb, model->lrr_bias,
1937                                               model->lrr_hap2dip, self->adjlrr_sd, self->stats.dispersion, hs_arr, n_hs)
1938                         : ad_phase_emis_log_lkl(ad0, ad1, gt_phase, n_imap, imap_arr, model->err_log_prb,
1939                                                 self->stats.dispersion, hs_arr, n_hs);
1940             } else {
1941                 emis_log_lkl = hmm_model == LRR_BAF
1942                                    ? lrr_baf_emis_log_lkl(lrr, baf, n_imap, imap_arr, model->err_log_prb,
1943                                                           model->lrr_bias, model->lrr_hap2dip, self->adjlrr_sd,
1944                                                           self->stats.dispersion, hs_arr, n_hs)
1945                                    : baf_phase_emis_log_lkl(baf, gt_phase, n_imap, imap_arr, model->err_log_prb,
1946                                                             self->stats.dispersion, hs_arr, n_hs);
1947             }
1948             path = log_viterbi_run(emis_log_lkl, n_imap, n_hs + (hmm_model == LRR_BAF ? n_hs : 0),
1949                                    model->xy_major_log_prb, model->xy_minor_log_prb, tel_log_prb,
1950                                    hmm_model == LRR_BAF ? NAN : model->flip_log_prb, last_p, first_q);
1951             free(emis_log_lkl);
1952 
1953             if (hmm_model == LRR_BAF)
1954                 for (int i = 0; i < n_imap; i++)
1955                     if (path[i] > n_hs) path[i] = 0;
1956             ret = get_path_segs(path, hs_arr, n_imap, hmm_model, middle, &beg, &m_beg, &end, &m_end, &nseg);
1957 
1958             if (ret) // two consecutive hidden states were used, hinting that
1959                      // testing of a middle state might be necessary
1960             {
1961                 free(path);
1962                 n_hs++;
1963                 if (middle && ret < 0.0f) middle++;
1964                 hts_expand(float, n_hs, m_hs, hs_arr);
1965                 hs_arr[n_hs - 1] = ret;
1966                 ks_introsort_float(n_hs, hs_arr);
1967             }
1968         } while (ret);
1969 
1970         // loop through all the segments called by the Viterbi algorithm
1971         for (int i = 0; i < nseg; i++) {
1972             // compute edges of the call
1973             int a = imap_arr[beg[i]];
1974             if (beg[i] == 0)
1975                 while (a > 0 && ldev[a - 1] == 0 && bdev[a - 1] == 0) a--; // extend call towards p telomere
1976             int b = imap_arr[end[i]];
1977             if (end[i] == n_imap - 1)
1978                 while (b < n - 1 && ldev[b + 1] == 0 && bdev[b + 1] == 0) b++; // extend call towards q telomere
1979 
1980             // if an autosomal call spans the whole chromosome and seems an isochromosome event, split it in two
1981             if (model->rid != model->genome_rules->x_rid && a == 0 && b == n - 1 && last_p > 0 && first_q < n_imap) {
1982                 float p_arm_ldev = get_median(lrr + 0, imap_arr[last_p - 1] + 1, NULL);
1983                 float q_arm_ldev = get_median(lrr + imap_arr[first_q], n - imap_arr[first_q], NULL);
1984                 float left_arm_ldev_se = get_se_mean(lrr + 0, imap_arr[last_p - 1] + 1, NULL);
1985                 float right_arm_ldev_se = get_se_mean(lrr + imap_arr[first_q], n - imap_arr[first_q], NULL);
1986                 float z2 = sqf(p_arm_ldev - q_arm_ldev) / (sqf(left_arm_ldev_se) + sqf(right_arm_ldev_se));
1987                 if (p_arm_ldev * q_arm_ldev < 0.0f && z2 > 8.0f * M_LN10) {
1988                     end[i] = last_p - 1;
1989                     b = imap_arr[end[i]];
1990                     nseg++;
1991                     hts_expand(int, nseg, m_beg, beg);
1992                     hts_expand(int, nseg, m_end, end);
1993                     beg[nseg - 1] = first_q;
1994                     end[nseg - 1] = n_imap - 1;
1995                 }
1996             }
1997 
1998             mocha.ldev = get_median(lrr + a, b + 1 - a, NULL);
1999             get_mocha_stats(pos, lrr, baf, gt_phase, n, a, b, cen_beg, cen_end, length, self->stats.baf_conc, &mocha);
2000 
2001             double bdev_lrr_baf;
2002             f3_data_t *f3_data = f3_pack(lrr, a, ad0, ad1, &mocha, model,
2003                  self, baf);
2004             kmin_brent(f3, -0.15, 0.15, f3_data, KMIN_EPS, &bdev_lrr_baf);
2005 	    free(f3_data);
2006             if (model->flags & WGS_DATA)
2007                 mocha.lod_lrr_baf =
2008                     lrr_ad_lod(lrr + a, ad0 + a, ad1 + a, mocha.n_sites, NULL, model->err_log_prb, model->lrr_bias,
2009                                model->lrr_hap2dip, self->adjlrr_sd, self->stats.dispersion, bdev_lrr_baf);
2010             else
2011                 mocha.lod_lrr_baf =
2012                     lrr_baf_lod(lrr + a, baf + a, mocha.n_sites, NULL, model->err_log_prb, model->lrr_bias,
2013                                 model->lrr_hap2dip, self->adjlrr_sd, self->stats.dispersion, bdev_lrr_baf);
2014 
2015             if (hmm_model == LRR_BAF) {
2016                 // here you need to check whether the call would have been
2017                 // better with the phased HMM model
2018                 int n_hets_imap = 0;
2019                 for (int j = beg[i]; j <= end[i]; j++)
2020                     if (!isnan(baf[imap_arr[j]])) {
2021                         n_hets_imap++;
2022                         hets_imap_arr[n_hets_imap - 1] = imap_arr[j];
2023                     }
2024                 // TODO here it needs to pass information about the centromeres
2025                 if (model->flags & WGS_DATA) {
2026                     mocha.lod_baf_phase =
2027                         compare_wgs_models(ad0, ad1, gt_phase, n_hets_imap, hets_imap_arr,
2028                                            (model->xy_major_log_prb + model->xy_minor_log_prb) / 2.0f,
2029                                            model->err_log_prb, model->flip_log_prb, tel_log_prb, self->stats.dispersion,
2030                                            model->bdev_baf_phase, model->bdev_baf_phase_n);
2031                 } else {
2032                     mocha.lod_baf_phase =
2033                         compare_models(baf, gt_phase, n_hets_imap, hets_imap_arr,
2034                                        (model->xy_major_log_prb + model->xy_minor_log_prb) / 2.0f, model->err_log_prb,
2035                                        model->flip_log_prb, tel_log_prb, self->stats.dispersion, model->bdev_baf_phase,
2036                                        model->bdev_baf_phase_n);
2037                 }
2038                 if (mocha.lod_baf_phase > 0.8 * mocha.lod_lrr_baf) continue;
2039 
2040                 // compute bdev, if possible
2041                 if (n_hets_imap > 0) {
2042                     mocha.bdev = model->flags & WGS_DATA
2043                                      ? get_ad_bdev(ad0, ad1, n_hets_imap, hets_imap_arr, self->stats.dispersion)
2044                                      : get_baf_bdev(baf, n_hets_imap, hets_imap_arr, self->stats.dispersion);
2045                 } else {
2046                     mocha.bdev = NAN;
2047                 }
2048                 mocha.bdev_se = NAN;
2049                 for (int j = 0; j < n_hets_imap; j++) as[hets_imap_arr[j]] = (int8_t)SIGN(baf[hets_imap_arr[j]] - 0.5f);
2050             } else {
2051                 // penalizes the LOD by the number of phase flips
2052                 mocha.n_flips = 0;
2053                 for (int j = beg[i]; j < end[i]; j++)
2054                     if (path[j] != path[j + 1]) mocha.n_flips++;
2055 
2056                 if (model->flags & WGS_DATA) {
2057                     double bdev;
2058 		    f6_data_t *f6_data = f6_pack(ad0, ad1, gt_phase, &mocha,
2059                         imap_arr, beg, i, path, self);
2060                     kmin_brent(f6, 0.1, 0.2, f6_data, KMIN_EPS, &bdev);
2061 		    free(f6_data);
2062                     mocha.bdev = fabsf((float)bdev);
2063                     mocha.lod_baf_phase =
2064                         ad_phase_lod(ad0, ad1, gt_phase, mocha.n_hets, imap_arr + beg[i], path + beg[i],
2065                                      model->err_log_prb, self->stats.dispersion, mocha.bdev);
2066                 } else {
2067                     double bdev;
2068 		    f8_data_t *f8_data = f8_pack(baf, gt_phase, &mocha,
2069 			imap_arr, beg, i, path, self);
2070                     kmin_brent(f8, 0.1, 0.2, f8_data, KMIN_EPS, &bdev);
2071 		    free(f8_data);
2072                     mocha.bdev = fabsf((float)bdev);
2073                     mocha.lod_baf_phase = baf_phase_lod(baf, gt_phase, mocha.n_hets, imap_arr + beg[i], path + beg[i],
2074                                                         model->err_log_prb, self->stats.dispersion, mocha.bdev);
2075                     // for larger bdev estimates, a model with phase might underestimate the deviation due to switch
2076                     // errors
2077                     if (mocha.bdev > self->stats.dispersion) {
2078                         float bdev = get_baf_bdev(baf, mocha.n_hets, imap_arr + beg[i], self->stats.dispersion);
2079                         if (!isnan(bdev)) mocha.bdev = bdev;
2080                     }
2081                 }
2082                 mocha.lod_baf_phase += (float)mocha.n_flips * model->flip_log_prb * (float)M_LOG10E;
2083 
2084                 for (int j = 0; j < mocha.n_hets; j++)
2085                     pbaf_arr[j] = (baf[imap_arr[beg[i] + j]] - 0.5f) * (float)SIGN(path[beg[i] + j]);
2086                 mocha.bdev_se = get_se_mean(pbaf_arr, mocha.n_hets, NULL);
2087                 for (int j = beg[i]; j <= end[i]; j++) as[imap_arr[j]] = (int8_t)SIGN(path[j]) * gt_phase[imap_arr[j]];
2088             }
2089 
2090             mocha.type = mocha_type(mocha.ldev, mocha.ldev_se, mocha.bdev, mocha.bdev_se, mocha.n_hets,
2091                                     model->lrr_hap2dip, mocha.p_arm, mocha.q_arm);
2092             mocha.cf = mocha_cell_fraction(mocha.ldev, mocha.ldev_se, mocha.bdev, mocha.n_hets, mocha.type,
2093                                            model->lrr_hap2dip);
2094             mocha_table->n++;
2095             hts_expand(mocha_t, mocha_table->n, mocha_table->m, mocha_table->a);
2096             mocha_table->a[mocha_table->n - 1] = mocha;
2097 
2098             // update information that will be stored in the output VCF and make
2099             // remaining sites missing
2100             for (int j = a; j <= b; j++) {
2101                 if (ldev[j] == 0) ldev[j] = float_to_int16(mocha.ldev);
2102                 if (bdev[j] == 0) bdev[j] = float_to_int16(mocha.bdev);
2103                 lrr[j] = NAN; // do not use the data again
2104                 baf[j] = NAN; // do not use the data again
2105             }
2106         }
2107         free(path);
2108         free(beg);
2109         free(end);
2110     }
2111 
2112     // clean up
2113     free(hs_arr);
2114     free(imap_arr);
2115     free(hets_imap_arr);
2116     free(pbaf_arr);
2117     free(pos);
2118     free(lrr);
2119     free(baf);
2120     memcpy(self->data_arr[LDEV], ldev, n * sizeof(int16_t));
2121     memcpy(self->data_arr[BDEV], bdev, n * sizeof(int16_t));
2122     memcpy(self->phase_arr, as, n * sizeof(int8_t));
2123     free(ldev);
2124     free(bdev);
2125     free(as);
2126 }
2127 
2128 // computes the medoid contig for LRR regression
2129 // TODO weight the coefficients appropriately
get_medoid(const float * coeffs,int n,int order)2130 static int get_medoid(const float *coeffs, int n, int order) {
2131     int medoid_idx = -1;
2132     float prev = INFINITY;
2133     for (int i = 0; i < n; i++) {
2134         float next = 0.0f;
2135         for (int j = 0; j < n; j++) {
2136             if (i != j) {
2137                 for (int k = 0; k <= order; k++) {
2138                     next += fabsf(coeffs[i * (order + 1) + k] - coeffs[j * (order + 1) + k]);
2139                 }
2140             }
2141         }
2142         if (next < prev) {
2143             prev = next;
2144             medoid_idx = i;
2145         }
2146     }
2147     return medoid_idx;
2148 }
2149 
2150 // groups numbers in two separate distributions
get_lrr_cutoff(const float * v,int n)2151 static float get_lrr_cutoff(const float *v, int n) {
2152     if (n <= 1) return NAN;
2153     float *w = (float *)malloc(n * sizeof(float));
2154     int j = 0;
2155     for (int i = 0; i < n; i++) {
2156         if (!isnan(v[i])) w[j++] = v[i];
2157     }
2158     if (j <= 1) {
2159         free(w);
2160         return NAN;
2161     }
2162     ks_introsort_float((size_t)j, w);
2163 
2164     // identify a reasonable initial split allowing for some outliers
2165     int k = j / 2;
2166     int d = (int)sqrtf((float)j) - 1;
2167     while (k > 1 && w[k - 1] - w[d] > w[j - 1 - d] - w[k - 1]) k--;
2168     while (k < j && w[k] - w[d] < w[j - 1 - d] - w[k]) k++;
2169 
2170     // run k-means clustering EM
2171     while (k > 0
2172            && w[k - 1] - (w[(k - 1) / 2] + w[k / 2]) * 0.5f > (w[(j + k - 1) / 2] + w[(j + k) / 2]) * 0.5f - w[k - 1])
2173         k--;
2174     while (k < j && w[k] - (w[(k - 1) / 2] + w[k / 2]) * 0.5f < (w[(j + k - 1) / 2] + w[(j + k) / 2]) * 0.5f - w[k])
2175         k++;
2176 
2177     float cutoff = (k > 0 && k < j) ? (w[k - 1] + w[k]) * 0.5f : NAN;
2178     free(w);
2179     return cutoff;
2180 }
2181 
2182 typedef struct
2183 {
2184     int16_t *ad0;
2185     int16_t *ad1;
2186     int n;
2187 }   f2_data_t;
2188 
f2(double x,void * data)2189 double f2(double x, void *data)
2190 {
2191     f2_data_t *d = data;
2192 
2193     return -lod_lkl_beta_binomial(d->ad0, d->ad1, d->n, NULL, x);
2194 }
2195 
f2_pack(int16_t * ad0,int16_t * ad1,int n)2196 static inline f2_data_t *f2_pack(int16_t *ad0, int16_t *ad1, int n)
2197 
2198 {
2199     f2_data_t *d = malloc(sizeof(f2_data_t));
2200 
2201     d->ad0 = ad0;
2202     d->ad1 = ad1;
2203     d->n = n;
2204 
2205     return d;
2206 }
2207 
2208 typedef struct
2209 {
2210     int16_t *ad0;
2211     int16_t *ad1;
2212     int n_imap;
2213     int *imap_arr;
2214 }   f9_data_t;
2215 
f9(double x,void * data)2216 double f9(double x, void *data)
2217 {
2218     f9_data_t *d = data;
2219 
2220     return -lod_lkl_beta_binomial(d->ad0, d->ad1, d->n_imap, d->imap_arr, x);
2221 }
2222 
f9_pack(int16_t * ad0,int16_t * ad1,int n_imap,int * imap_arr)2223 static inline f9_data_t *f9_pack(int16_t *ad0, int16_t *ad1, int n_imap,
2224     int *imap_arr)
2225 {
2226     f9_data_t *d = malloc(sizeof(f9_data_t));
2227 
2228     d->ad0 = ad0;
2229     d->ad1 = ad1;
2230     d->n_imap = n_imap;
2231     d->imap_arr = imap_arr;
2232 
2233     return d;
2234 }
2235 
2236 // this function computes several contig stats and then clears the contig data from the sample
sample_stats(sample_t * self,const model_t * model)2237 static void sample_stats(sample_t *self, const model_t *model) {
2238     int n = self->n;
2239     if (n == 0) return;
2240     self->n_sites += n;
2241     for (int i = 0; i < n; i++)
2242         if (self->phase_arr[i] == bcf_int8_missing) self->n_missing_gts++;
2243 
2244     int16_t *ad0 = self->data_arr[AD0];
2245     int16_t *ad1 = self->data_arr[AD1];
2246     float *lrr = (float *)malloc(n * sizeof(float));
2247     float *baf = (float *)malloc(n * sizeof(float));
2248     if (model->flags & WGS_DATA) {
2249         ad_to_lrr_baf(ad0, ad1, lrr, baf, n);
2250     } else {
2251         for (int i = 0; i < n; i++) {
2252             lrr[i] = int16_to_float(self->data_arr[LRR][i]);
2253             baf[i] = int16_to_float(self->data_arr[BAF][i]);
2254         }
2255     }
2256     int *imap_arr = (int *)malloc(n * sizeof(int));
2257 
2258     if (model->rid == model->genome_rules->x_rid) {
2259         int n_imap = 0;
2260         for (int i = 0; i < n; i++) {
2261             if (!isnan(baf[i])) self->n_hets++;
2262             int pos = model->locus_arr[self->vcf_imap_arr[i]].pos;
2263             if (pos > model->genome_rules->x_nonpar_beg && pos < model->genome_rules->x_nonpar_end
2264                 && (pos < model->genome_rules->x_xtr_beg || pos > model->genome_rules->x_xtr_end)) {
2265                 if (!isnan(baf[i])) self->x_nonpar_n_hets++;
2266                 n_imap++;
2267                 imap_arr[n_imap - 1] = i;
2268             } else if (!isnan(baf[i])) {
2269                 if (pos <= model->genome_rules->x_nonpar_beg) {
2270                     self->par1_n_hets++;
2271                 } else if (pos >= model->genome_rules->x_xtr_beg || pos <= model->genome_rules->x_xtr_end) {
2272                     self->xtr_n_hets++;
2273                 } else if (pos >= model->genome_rules->x_nonpar_end) {
2274                     self->par2_n_hets++;
2275                 }
2276             }
2277         }
2278         self->x_nonpar_lrr_median = get_median(lrr, n_imap, imap_arr);
2279 
2280         if (model->flags & WGS_DATA) {
2281             f9_data_t *f9_data = f9_pack(ad0, ad1, n_imap, imap_arr);
2282             double x;
2283             kmin_brent(f9, 0.1, 0.2, f9_data, KMIN_EPS, &x); // dispersions above 0.5 are not allowed
2284 	    free(f9_data);
2285             self->x_nonpar_dispersion = (float)x;
2286         } else {
2287             self->x_nonpar_dispersion = get_sample_sd(baf, n_imap, imap_arr);
2288         }
2289     } else if (model->rid == model->genome_rules->y_rid) {
2290         int n_imap = 0;
2291         for (int i = 0; i < n; i++) {
2292             if (!isnan(baf[i])) self->n_hets++;
2293             int pos = model->locus_arr[self->vcf_imap_arr[i]].pos;
2294             if (pos > model->genome_rules->y_nonpar_beg && pos < model->genome_rules->y_nonpar_end
2295                 && (pos < model->genome_rules->y_xtr_beg || pos > model->genome_rules->y_xtr_end)) {
2296                 n_imap++;
2297                 imap_arr[n_imap - 1] = i;
2298             }
2299         }
2300         self->y_nonpar_lrr_median = get_median(lrr, n_imap, imap_arr);
2301     } else if (model->rid == model->genome_rules->mt_rid) {
2302         self->mt_lrr_median = get_median(lrr, n, NULL);
2303     } else {
2304         // expand arrays if necessary
2305         self->n_stats++;
2306         hts_expand(stats_t, self->n_stats, self->m_stats, self->stats_arr);
2307 
2308         if (model->flags & WGS_DATA) {
2309             double x;
2310 	    f2_data_t *f2_data = f2_pack(ad0, ad1, n);
2311             kmin_brent(f2, 0.1, 0.2, f2_data, KMIN_EPS, &x); // dispersions above 0.5 are not allowed
2312 	    free(f2_data);
2313             self->stats_arr[self->n_stats - 1].dispersion = (float)x;
2314         } else {
2315             self->stats_arr[self->n_stats - 1].dispersion = get_sample_sd(baf, n, NULL);
2316         }
2317         for (int i = 0; i < n; i++)
2318             if (!isnan(baf[i])) self->n_hets++;
2319         self->stats_arr[self->n_stats - 1].lrr_median = get_median(lrr, n, NULL);
2320         self->stats_arr[self->n_stats - 1].lrr_sd = get_sample_sd(lrr, n, NULL);
2321 
2322         int conc, disc;
2323         get_baf_conc(baf, self->phase_arr, n, &conc, &disc);
2324         self->stats_arr[self->n_stats - 1].baf_conc = (float)conc / (float)(conc + disc);
2325         self->stats_arr[self->n_stats - 1].baf_auto = get_baf_auto_corr(baf, self->phase_arr, n);
2326         // performs polynomial regression for LRR
2327         if (model->lrr_gc_order > 0 && n > model->lrr_gc_order) {
2328             float tss = get_tss(lrr, n);
2329             int ret = polyfit(lrr, model->gc_arr, n, self->vcf_imap_arr, model->lrr_gc_order,
2330                               self->stats_arr[self->n_stats - 1].coeffs);
2331             if (ret < 0) error("Polynomial regression failed\n");
2332             adjust_lrr(lrr, model->gc_arr, n, self->vcf_imap_arr, self->stats_arr[self->n_stats - 1].coeffs,
2333                        model->lrr_gc_order);
2334             self->stats_arr[self->n_stats - 1].coeffs[0] += get_median(lrr, n, NULL); // further adjusts by median
2335             float rss = get_tss(lrr, n);
2336             self->stats_arr[self->n_stats - 1].lrr_gc_rel_ess = 1.0f - rss / tss;
2337         } else if (model->lrr_gc_order != -1) {
2338             self->stats_arr[self->n_stats - 1].coeffs[0] = get_median(lrr, n, NULL);
2339             self->stats_arr[self->n_stats - 1].lrr_gc_rel_ess = NAN;
2340         }
2341         // compute autocorrelation after GC correction
2342         self->stats_arr[self->n_stats - 1].lrr_auto = get_lrr_auto_corr(lrr, n, NULL);
2343     }
2344 
2345     free(lrr);
2346     free(baf);
2347     free(imap_arr);
2348 }
2349 
2350 // this function computes the median of contig stats
sample_summary(sample_t * self,int n,model_t * model,int compute_gender)2351 static void sample_summary(sample_t *self, int n, model_t *model, int compute_gender) {
2352     float *tmp_arr = (float *)malloc(n * sizeof(float));
2353     int m_tmp = n;
2354 
2355     for (int i = 0; i < n; i++) {
2356         hts_expand(float, self[i].n_stats *(model->lrr_gc_order < 0 ? 1 : model->lrr_gc_order + 1), m_tmp, tmp_arr);
2357 
2358         for (int j = 0; j < self[i].n_stats; j++) tmp_arr[j] = self[i].stats_arr[j].lrr_median;
2359         self[i].stats.lrr_median = get_median(tmp_arr, self[i].n_stats, NULL);
2360         for (int j = 0; j < self[i].n_stats; j++) tmp_arr[j] = self[i].stats_arr[j].lrr_sd;
2361         self[i].stats.lrr_sd = get_median(tmp_arr, self[i].n_stats, NULL);
2362         for (int j = 0; j < self[i].n_stats; j++) tmp_arr[j] = self[i].stats_arr[j].lrr_auto;
2363         self[i].stats.lrr_auto = get_median(tmp_arr, self[i].n_stats, NULL);
2364 
2365         for (int j = 0; j < self[i].n_stats; j++) tmp_arr[j] = self[i].stats_arr[j].dispersion;
2366         self[i].stats.dispersion = get_median(tmp_arr, self[i].n_stats, NULL);
2367         for (int j = 0; j < self[i].n_stats; j++) tmp_arr[j] = self[i].stats_arr[j].baf_conc;
2368         self[i].stats.baf_conc = get_median(tmp_arr, self[i].n_stats, NULL);
2369         for (int j = 0; j < self[i].n_stats; j++) tmp_arr[j] = self[i].stats_arr[j].baf_auto;
2370         self[i].stats.baf_auto = get_median(tmp_arr, self[i].n_stats, NULL);
2371 
2372         self[i].adjlrr_sd = self[i].stats.lrr_sd;
2373         if (model->lrr_gc_order == 0) {
2374             for (int j = 0; j < self[i].n_stats; j++) tmp_arr[j] = self[i].stats_arr[j].coeffs[0];
2375             self[i].stats.coeffs[0] = get_median(tmp_arr, self[i].n_stats, NULL);
2376         } else if (model->lrr_gc_order > 0 && self[i].n_stats > 0) {
2377             for (int j = 0; j < self[i].n_stats; j++)
2378                 for (int k = 0; k <= model->lrr_gc_order; k++)
2379                     tmp_arr[j * (model->lrr_gc_order + 1) + k] = self[i].stats_arr[j].coeffs[k];
2380             int medoid_idx = get_medoid(tmp_arr, self[i].n_stats, model->lrr_gc_order);
2381             for (int k = 0; k <= model->lrr_gc_order; k++)
2382                 self[i].stats.coeffs[k] = tmp_arr[medoid_idx * (model->lrr_gc_order + 1) + k];
2383             for (int j = 0; j < self[i].n_stats; j++) tmp_arr[j] = self[i].stats_arr[j].lrr_gc_rel_ess;
2384             self[i].stats.lrr_gc_rel_ess = get_median(tmp_arr, self[i].n_stats, NULL);
2385             self[i].adjlrr_sd *= sqrtf(1.0f - self[i].stats.lrr_gc_rel_ess); // not perfect, but good enough(?)
2386         }
2387         free(self[i].stats_arr);
2388     }
2389 
2390     if (compute_gender) {
2391         if (model->flags & WGS_DATA) {
2392             if (isnan(model->lrr_cutoff)) model->lrr_cutoff = -0.3f; // arbitrary cutoff between -M_LN2 and 0
2393         }
2394 
2395         // determine LRR cutoff between haploid and diploid
2396         if (isnan(model->lrr_cutoff)) {
2397             int j = 0;
2398             for (int i = 0; i < n; i++)
2399                 if (!isnan(self[i].x_nonpar_lrr_median))
2400                     tmp_arr[j++] = isnan(self[i].stats.lrr_median)
2401                                        ? self[i].x_nonpar_lrr_median
2402                                        : self[i].x_nonpar_lrr_median - self[i].stats.lrr_median;
2403             model->lrr_cutoff = get_lrr_cutoff(tmp_arr, j);
2404         }
2405 
2406         // determine gender of samples
2407         for (int i = 0; i < n; i++) {
2408             float tmp = isnan(self[i].stats.lrr_median) ? self[i].x_nonpar_lrr_median
2409                                                         : self[i].x_nonpar_lrr_median - self[i].stats.lrr_median;
2410             if (tmp < model->lrr_cutoff)
2411                 self[i].computed_gender = GENDER_MALE;
2412             else if (tmp > model->lrr_cutoff)
2413                 self[i].computed_gender = GENDER_FEMALE;
2414         }
2415     }
2416 
2417     free(tmp_arr);
2418 }
2419 
mocha_print_stats(FILE * restrict stream,const sample_t * self,int n,int lrr_gc_order,const bcf_hdr_t * hdr,int flags)2420 static void mocha_print_stats(FILE *restrict stream, const sample_t *self, int n, int lrr_gc_order,
2421                               const bcf_hdr_t *hdr, int flags) {
2422     if (stream == NULL) return;
2423     char gender[3];
2424     gender[GENDER_UNKNOWN] = 'U';
2425     gender[GENDER_MALE] = 'M';
2426     gender[GENDER_FEMALE] = 'F';
2427     fputs("sample_id", stream);
2428     fputs("\tcomputed_gender", stream);
2429     fputs("\tcall_rate", stream);
2430     fputs(flags & WGS_DATA ? "\tcov_median" : "\tlrr_median", stream);
2431     fputs(flags & WGS_DATA ? "\tcov_sd" : "\tlrr_sd", stream);
2432     fputs(flags & WGS_DATA ? "\tcov_auto" : "\tlrr_auto", stream);
2433     fputs(flags & WGS_DATA ? "\tbaf_corr" : "\tbaf_sd", stream);
2434     fputs("\tbaf_conc", stream);
2435     fputs("\tbaf_auto", stream);
2436     fputs("\tn_sites", stream);
2437     fputs("\tn_hets", stream);
2438     fputs("\tx_nonpar_n_hets", stream);
2439     fputs("\tpar1_n_hets", stream);
2440     fputs("\txtr_n_hets", stream);
2441     fputs("\tpar2_n_hets", stream);
2442     fputs("\tx_nonpar_baf_corr", stream);
2443     fputs(flags & WGS_DATA ? "\tx_nonpar_cov_median" : "\tx_nonpar_lrr_median", stream);
2444     fputs(flags & WGS_DATA ? "\ty_nonpar_cov_median" : "\ty_nonpar_lrr_median", stream);
2445     fputs(flags & WGS_DATA ? "\tmt_cov_median" : "\tmt_lrr_median", stream);
2446     fputs("\tlrr_gc_rel_ess", stream);
2447     for (int j = 0; j <= lrr_gc_order; j++) fprintf(stream, "\tlrr_gc_%d", j);
2448     fputc('\n', stream);
2449     for (int i = 0; i < n; i++) {
2450         fputs(bcf_hdr_int2id(hdr, BCF_DT_SAMPLE, self[i].idx), stream);
2451         fprintf(stream, "\t%c", gender[self[i].computed_gender]);
2452         fprintf(stream, "\t%.4f",
2453                 self[i].stats.call_rate ? self[i].stats.call_rate
2454                                         : 1.0 - (float)self[i].n_missing_gts / (float)self[i].n_sites);
2455         fprintf(stream, "\t%.4f", flags & WGS_DATA ? expf(self[i].stats.lrr_median) : self[i].stats.lrr_median);
2456         fprintf(stream, "\t%.4f",
2457                 flags & WGS_DATA ? expf(self[i].stats.lrr_median) * self[i].stats.lrr_sd : self[i].stats.lrr_sd);
2458         fprintf(stream, "\t%.4f", self[i].stats.lrr_auto);
2459         fprintf(stream, "\t%.4f", self[i].stats.dispersion);
2460         fprintf(stream, "\t%.4f", self[i].stats.baf_conc);
2461         fprintf(stream, "\t%.4f", self[i].stats.baf_auto);
2462         fprintf(stream, "\t%d", self[i].n_sites);
2463         fprintf(stream, "\t%d", self[i].n_hets);
2464         fprintf(stream, "\t%d", self[i].x_nonpar_n_hets);
2465         fprintf(stream, "\t%d", self[i].par1_n_hets);
2466         fprintf(stream, "\t%d", self[i].xtr_n_hets);
2467         fprintf(stream, "\t%d", self[i].par2_n_hets);
2468         fprintf(stream, "\t%.4f", self[i].x_nonpar_dispersion);
2469         fprintf(stream, "\t%.4f", flags & WGS_DATA ? expf(self[i].x_nonpar_lrr_median) : self[i].x_nonpar_lrr_median);
2470         fprintf(stream, "\t%.4f", flags & WGS_DATA ? expf(self[i].y_nonpar_lrr_median) : self[i].y_nonpar_lrr_median);
2471         fprintf(stream, "\t%.4f", flags & WGS_DATA ? expf(self[i].mt_lrr_median) : self[i].mt_lrr_median);
2472         fprintf(stream, "\t%.4f", self[i].stats.lrr_gc_rel_ess);
2473         for (int j = 0; j <= lrr_gc_order; j++) fprintf(stream, "\t%.4f", self[i].stats.coeffs[j]);
2474         fputc('\n', stream);
2475     }
2476     if (stream != stdout && stream != stderr) fclose(stream);
2477 }
2478 
2479 /*********************************
2480  * VCF READ AND WRITE METHODS    *
2481  *********************************/
2482 
2483 // moves synced bcf reader to first useful line for reader0
bcf_sr_next_line_reader0(bcf_srs_t * sr)2484 static inline int bcf_sr_next_line_reader0(bcf_srs_t *sr) {
2485     int nret = bcf_sr_next_line(sr);
2486     while (nret > 0 && !bcf_sr_has_line(sr, 0)) nret = bcf_sr_next_line(sr);
2487     return nret;
2488 }
2489 
2490 // write one contig
put_contig(bcf_srs_t * sr,const sample_t * sample,const model_t * model,htsFile * out_fh,bcf_hdr_t * out_hdr)2491 static int put_contig(bcf_srs_t *sr, const sample_t *sample, const model_t *model, htsFile *out_fh,
2492                       bcf_hdr_t *out_hdr) {
2493     int rid = model->rid;
2494     bcf_hdr_t *hdr = bcf_sr_get_header(sr, 0);
2495     bcf_sr_seek(sr, bcf_hdr_id2name(hdr, rid), 0);
2496     int nsmpl = bcf_hdr_nsamples(out_hdr);
2497 
2498     int *synced_iter = (int *)calloc(nsmpl, sizeof(int));
2499     float *bdev = (float *)calloc(nsmpl, sizeof(float));
2500     float *ldev = (float *)calloc(nsmpl, sizeof(float));
2501     int *as = (int *)calloc(nsmpl, sizeof(int));
2502 
2503     int i;
2504     for (i = 0; bcf_sr_next_line_reader0(sr); i++) {
2505         bcf1_t *line = bcf_sr_get_line(sr, 0);
2506         if (rid != line->rid) break;
2507 
2508         for (int j = 0; j < nsmpl; j++) {
2509             while (synced_iter[j] < sample[j].n - 1 && sample[j].vcf_imap_arr[synced_iter[j]] < i) synced_iter[j]++;
2510             if (sample[j].vcf_imap_arr[synced_iter[j]] == i) {
2511                 if (sample[j].data_arr[LDEV])
2512                     ldev[sample[j].idx] = int16_to_float(sample[j].data_arr[LDEV][synced_iter[j]]);
2513                 if (sample[j].data_arr[BDEV])
2514                     bdev[sample[j].idx] = int16_to_float(sample[j].data_arr[BDEV][synced_iter[j]]);
2515                 if (sample[j].phase_arr) as[sample[j].idx] = sample[j].phase_arr[synced_iter[j]];
2516             } else {
2517                 // if no match variant found, match the end of the contig or keep conservative
2518                 if (i == 0 && sample[j].data_arr[BDEV])
2519                     bdev[sample[j].idx] = int16_to_float(sample[j].data_arr[BDEV][0]);
2520                 if (i == 0 && sample[j].data_arr[LDEV])
2521                     ldev[sample[j].idx] = int16_to_float(sample[j].data_arr[LDEV][0]);
2522                 if (sample[j].data_arr[BDEV] && int16_to_float(sample[j].data_arr[BDEV][synced_iter[j]]) == 0.0f)
2523                     bdev[sample[j].idx] = 0.0f;
2524                 if (sample[j].data_arr[LDEV] && int16_to_float(sample[j].data_arr[LDEV][synced_iter[j]]) == 0.0f)
2525                     ldev[sample[j].idx] = 0.0f;
2526                 if (sample[j].phase_arr) as[sample[j].idx] = 0;
2527             }
2528         }
2529         if (!(model->flags & WGS_DATA)) {
2530             bcf_update_info_float(out_hdr, line, "ADJ_COEFF", model->locus_arr[i].adjust, 9);
2531         }
2532         if (!(model->flags & NO_ANNOT)) {
2533             bcf_update_format_float(out_hdr, line, "Ldev", ldev, (int)nsmpl);
2534             bcf_update_format_float(out_hdr, line, "Bdev", bdev, (int)nsmpl);
2535         }
2536         bcf_update_format_int32(out_hdr, line, "AS", as, (int)nsmpl);
2537 
2538         if (bcf_write(out_fh, out_hdr, line) < 0) error("Unable to write to output VCF file\n");
2539     }
2540 
2541     free(synced_iter);
2542     free(ldev);
2543     free(bdev);
2544     free(as);
2545 
2546     return i;
2547 }
2548 
2549 // write header
print_hdr(htsFile * out_fh,bcf_hdr_t * hdr,int argc,char * argv[],int record_cmd_line,int flags)2550 static bcf_hdr_t *print_hdr(htsFile *out_fh, bcf_hdr_t *hdr, int argc, char *argv[], int record_cmd_line, int flags) {
2551     bcf_hdr_t *out_hdr = bcf_hdr_dup(hdr);
2552     int adj_coeff_id = bcf_hdr_id2int(hdr, BCF_DT_ID, "ADJ_COEFF");
2553     if (!(flags & WGS_DATA) && !bcf_hdr_idinfo_exists(hdr, BCF_HL_INFO, adj_coeff_id))
2554         bcf_hdr_append(out_hdr,
2555                        "##INFO=<ID=ADJ_COEFF,Number=9,Type=Float,Description=\"Adjust coefficients "
2556                        "(order=AA_BAF0,AA_BAF1,AA_LRR0,AB_BAF0,AB_BAF1,AB_LRR0,BB_BAF0,BB_BAF1,BB_LRR0)"
2557                        "\">");
2558     if (!(flags & NO_ANNOT)) {
2559         int ldev_id = bcf_hdr_id2int(hdr, BCF_DT_ID, "Ldev");
2560         if (!bcf_hdr_idinfo_exists(hdr, BCF_HL_FMT, ldev_id))
2561             bcf_hdr_append(out_hdr,
2562                            "##FORMAT=<ID=Ldev,Number=1,Type=Float,Description=\"LRR deviation "
2563                            "due to chromosomal alteration\">");
2564 
2565         int bdev_id = bcf_hdr_id2int(hdr, BCF_DT_ID, "Bdev");
2566         if (!bcf_hdr_idinfo_exists(hdr, BCF_HL_FMT, bdev_id))
2567             bcf_hdr_append(out_hdr,
2568                            "##FORMAT=<ID=Bdev,Number=1,Type=Float,Description=\"BAF deviation "
2569                            "due to chromosomal alteration\">");
2570     }
2571     int as_id = bcf_hdr_id2int(hdr, BCF_DT_ID, "AS");
2572     if (!bcf_hdr_idinfo_exists(hdr, BCF_HL_FMT, as_id))
2573         bcf_hdr_append(out_hdr,
2574                        "##FORMAT=<ID=AS,Number=1,Type=Integer,Description=\"Allelic "
2575                        "shift (1/-1 if the alternate allele is over/under represented)\">");
2576     if (record_cmd_line) bcf_hdr_append_version(out_hdr, argc, argv, "bcftools_plugin");
2577     if (bcf_hdr_write(out_fh, out_hdr) < 0) error("Unable to write to output VCF file\n");
2578     return out_hdr;
2579 }
2580 
2581 // retrieve genotypes as NC, AA, AB, BB
2582 // assumes little endian architecture
bcf_get_ab_genotypes(bcf_fmt_t * fmt,int8_t * gts,int nsmpl,int allele_a,int allele_b)2583 static int bcf_get_ab_genotypes(bcf_fmt_t *fmt, int8_t *gts, int nsmpl, int allele_a, int allele_b) {
2584     if (!fmt || fmt->n != 2) return 0;
2585 
2586 #define BRANCH(type_t, bcf_type_vector_end)                                                                            \
2587     {                                                                                                                  \
2588         type_t *p = (type_t *)fmt->p;                                                                                  \
2589         for (int i = 0; i < nsmpl; i++, p += 2) {                                                                      \
2590             if (p[0] == bcf_type_vector_end || bcf_gt_is_missing(p[0]) || p[1] == bcf_type_vector_end                  \
2591                 || bcf_gt_is_missing(p[1])) {                                                                          \
2592                 gts[i] = GT_NC;                                                                                        \
2593             } else {                                                                                                   \
2594                 type_t allele_0 = bcf_gt_allele(p[0]);                                                                 \
2595                 type_t allele_1 = bcf_gt_allele(p[1]);                                                                 \
2596                 if (allele_0 == allele_a && allele_1 == allele_a)                                                      \
2597                     gts[i] = GT_AA;                                                                                    \
2598                 else if (allele_0 == allele_b && allele_1 == allele_b)                                                 \
2599                     gts[i] = GT_BB;                                                                                    \
2600                 else if ((allele_0 == allele_a && allele_1 == allele_b)                                                \
2601                          || (allele_0 == allele_b && allele_1 == allele_a))                                            \
2602                     gts[i] = GT_AB;                                                                                    \
2603                 else                                                                                                   \
2604                     return -1;                                                                                         \
2605             }                                                                                                          \
2606         }                                                                                                              \
2607     }
2608     switch (fmt->type) {
2609     case BCF_BT_INT8:
2610         BRANCH(int8_t, bcf_int8_vector_end);
2611         break;
2612     case BCF_BT_INT16:
2613         BRANCH(int16_t, bcf_int16_vector_end);
2614         break;
2615     case BCF_BT_INT32:
2616         BRANCH(int32_t, bcf_int32_vector_end);
2617         break;
2618     default:
2619         error("Unexpected type %d\n", fmt->type);
2620     }
2621 #undef BRANCH
2622 
2623     return 1;
2624 }
2625 
2626 // read one contig
get_contig(bcf_srs_t * sr,sample_t * sample,model_t * model)2627 static int get_contig(bcf_srs_t *sr, sample_t *sample, model_t *model) {
2628     int rid = model->rid;
2629     bcf_hdr_t *hdr = bcf_sr_get_header(sr, 0);
2630     bcf_sr_seek(sr, bcf_hdr_id2name(hdr, rid), 0);
2631 
2632     bcf_fmt_t *baf_fmt = NULL, *lrr_fmt = NULL;
2633     bcf_info_t *info;
2634     int nsmpl = bcf_hdr_nsamples(hdr);
2635 
2636     model->n = 0;
2637     model->n_flipped = 0;
2638     for (int j = 0; j < nsmpl; j++) sample[j].n = 0;
2639 
2640     if (!(model->flags & USE_NO_RULES_CHRS) && model->genome_rules->cen_beg[rid] == 0
2641         && model->genome_rules->cen_end[rid] == 0 && rid != model->genome_rules->mt_rid)
2642         return 0;
2643 
2644     int8_t *gts = (int8_t *)malloc(nsmpl * sizeof(int8_t));
2645     int8_t *phase_arr = (int8_t *)malloc(nsmpl * sizeof(int8_t));
2646     int *imap_arr = (int *)malloc(nsmpl * sizeof(int));
2647     int16_t *gt0 = (int16_t *)malloc(nsmpl * sizeof(int16_t));
2648     int16_t *gt1 = (int16_t *)malloc(nsmpl * sizeof(int16_t));
2649     int16_t *ad0 = (int16_t *)malloc(nsmpl * sizeof(int16_t));
2650     int16_t *ad1 = (int16_t *)malloc(nsmpl * sizeof(int16_t));
2651     int *last_het_pos = (int *)calloc(nsmpl, sizeof(int));
2652     int *last_pos = (int *)calloc(nsmpl, sizeof(int));
2653 
2654     int i;
2655     for (i = 0; bcf_sr_next_line_reader0(sr); i++) {
2656         bcf1_t *line = bcf_sr_get_line(sr, 0);
2657         if (model->filter) {
2658             int ret = filter_test(model->filter, line, NULL);
2659             if ((model->filter_logic == FLT_INCLUDE && !ret) || ret) continue;
2660         }
2661         if (rid != line->rid) break;
2662         int pos = line->pos + 1;
2663 
2664         hts_expand(locus_t, i + 1, model->m_locus, model->locus_arr);
2665 
2666         model->locus_arr[i].pos = pos;
2667 
2668         hts_expand(float, i + 1, model->m_gc, model->gc_arr);
2669         if (model->gc_id >= 0 && (info = bcf_get_info_id(line, model->gc_id)))
2670             model->gc_arr[i] = info->v1.f;
2671         else
2672             model->gc_arr[i] = NAN;
2673 
2674         // if failing inclusion/exclusion requirement, skip line
2675         if ((model->flags & FLT_EXCLUDE) && bcf_sr_get_line(sr, 1)) continue;
2676         if ((model->flags & FLT_INCLUDE) && !bcf_sr_get_line(sr, 1)) continue;
2677 
2678         // if site falls in short arm or centromere regions skip line
2679         if (!(model->flags & USE_SHORT_ARMS) && model->genome_rules->is_short_arm[rid]
2680             && pos < model->genome_rules->cen_beg[rid])
2681             continue;
2682         if (!(model->flags & USE_CENTROMERES) && pos > model->genome_rules->cen_beg[rid]
2683             && pos < model->genome_rules->cen_end[rid])
2684             continue;
2685 
2686         if (model->mhc_idx
2687             && regidx_overlap(model->mhc_idx, bcf_hdr_id2name(hdr, line->rid), line->pos, line->pos + 1, NULL))
2688             continue;
2689         if (model->kir_idx
2690             && regidx_overlap(model->kir_idx, bcf_hdr_id2name(hdr, line->rid), line->pos, line->pos + 1, NULL))
2691             continue;
2692 
2693         bcf_fmt_t *gt_fmt = bcf_get_fmt_id(line, model->gt_id);
2694         if (!bcf_get_genotype_phase(gt_fmt, phase_arr, nsmpl)) continue;
2695 
2696         // if neither AD nor LRR and BAF formats are present, skip line
2697         if (model->flags & WGS_DATA) {
2698             if (!bcf_get_unphased_genotype_alleles(gt_fmt, gt0, gt1, nsmpl)) continue;
2699             if (!bcf_get_allelic_depth(bcf_get_fmt_id(line, model->ad_id), gt0, gt1, ad0, ad1, nsmpl)) continue;
2700         } else {
2701             if ((info = bcf_get_info_id(line, model->allele_a_id)))
2702                 model->locus_arr[i].allele_a = info->v1.i;
2703             else
2704                 error("Error: ALLELE_A missing at position %s:%" PRId64 "\n", bcf_hdr_id2name(hdr, line->rid),
2705                       line->pos + 1);
2706             if ((info = bcf_get_info_id(line, model->allele_b_id)))
2707                 model->locus_arr[i].allele_b = info->v1.i;
2708             else
2709                 error("Error: ALLELE_B missing at position %s:%" PRId64 "\n", bcf_hdr_id2name(hdr, line->rid),
2710                       line->pos + 1);
2711             // missing ALLELE_A and ALLELE_B information
2712             if (model->locus_arr[i].allele_a < 0 || model->locus_arr[i].allele_b < 0) continue;
2713             // if there are no genotypes, skip line
2714             if (bcf_get_ab_genotypes(gt_fmt, gts, nsmpl, model->locus_arr[i].allele_a, model->locus_arr[i].allele_b)
2715                 < 0)
2716                 error("Error: site %s:%" PRId64 " contains non-AB alleles\n", bcf_hdr_id2name(hdr, line->rid),
2717                       line->pos + 1);
2718 
2719             if (!(lrr_fmt = bcf_get_fmt_id(line, model->lrr_id)) || !(baf_fmt = bcf_get_fmt_id(line, model->baf_id)))
2720                 continue;
2721 
2722             for (int j = 0; j < nsmpl; j++) {
2723                 if (bcf_float_is_missing(((float *)lrr_fmt->p)[sample[j].idx]))
2724                     ((float *)lrr_fmt->p)[sample[j].idx] = NAN;
2725                 if (bcf_float_is_missing(((float *)baf_fmt->p)[sample[j].idx]))
2726                     ((float *)baf_fmt->p)[sample[j].idx] = NAN;
2727             }
2728 
2729             int is_x_nonpar = rid == model->genome_rules->x_rid && pos > model->genome_rules->x_nonpar_beg
2730                               && pos < model->genome_rules->x_nonpar_end
2731                               && (pos < model->genome_rules->x_xtr_beg || pos > model->genome_rules->x_xtr_end);
2732             int is_y_or_mt = rid == model->genome_rules->y_rid || rid == model->genome_rules->mt_rid;
2733 
2734             // adjust cluster centers and slopes, inspired by
2735             // (i) Staaf, J. et al. Normalization of Illumina Infinium whole-genome
2736             // SNP data improves copy number estimates and allelic intensity ratios.
2737             // BMC Bioinformatics 9, 409 (2008) & (ii) Mayrhofer, M., Viklund, B. &
2738             // Isaksson, A. Rawcopy: Improved copy number analysis with Affymetrix
2739             // arrays. Sci Rep 6, 36158 (2016)
2740             for (int gt = GT_AA; gt <= GT_BB; gt++) {
2741                 if (is_y_or_mt) continue;
2742                 int k = 0;
2743                 for (int j = 0; j < nsmpl; j++) {
2744                     if (is_x_nonpar && sample[j].computed_gender == GENDER_MALE) continue;
2745                     if (gts[sample[j].idx] == gt) imap_arr[k++] = sample[j].idx;
2746                 }
2747                 float baf_b = 0.0f, baf_m = 0.0f, lrr_b = 0.0f;
2748                 if (model->regress_baf_lrr != -1 && model->regress_baf_lrr <= k) {
2749                     float xss = 0.0f, yss = 0.0f, xyss = 0.0f;
2750                     get_cov((float *)lrr_fmt->p, (float *)baf_fmt->p, k, imap_arr, &xss, &yss, &xyss);
2751                     baf_m = xyss / xss;
2752                     for (int j = 0; j < k; j++)
2753                         ((float *)baf_fmt->p)[imap_arr[j]] -= baf_m * ((float *)lrr_fmt->p)[imap_arr[j]];
2754                 }
2755                 if (model->adj_baf_lrr != -1 && model->adj_baf_lrr <= k) {
2756                     baf_b = get_median((float *)baf_fmt->p, k, imap_arr) - (float)(gt - 1) * 0.5f;
2757                     if (isnan(baf_b)) baf_b = 0.0f;
2758                     for (int j = 0; j < k; j++) ((float *)baf_fmt->p)[imap_arr[j]] -= baf_b;
2759                     lrr_b = get_median((float *)lrr_fmt->p, k, imap_arr);
2760                     if (isnan(lrr_b)) lrr_b = 0.0f;
2761                     for (int j = 0; j < k; j++) ((float *)lrr_fmt->p)[imap_arr[j]] -= lrr_b;
2762                 }
2763                 // corrects males on X nonPAR
2764                 if (is_x_nonpar) {
2765                     for (int j = 0; j < nsmpl; j++) {
2766                         if (sample[j].computed_gender != GENDER_MALE) continue;
2767                         if (gts[sample[j].idx] == gt) {
2768                             ((float *)baf_fmt->p)[sample[j].idx] -=
2769                                 baf_m * ((float *)lrr_fmt->p)[sample[j].idx] + baf_b;
2770                             ((float *)lrr_fmt->p)[sample[j].idx] -= lrr_b;
2771                         }
2772                     }
2773                 }
2774                 model->locus_arr[i].adjust[gt - 1][0] = baf_b;
2775                 model->locus_arr[i].adjust[gt - 1][1] = baf_m;
2776                 model->locus_arr[i].adjust[gt - 1][2] = lrr_b;
2777             }
2778 
2779             // if allele A index is bigger than allele B index flip the BAF to make
2780             // sure it refers to the highest allele
2781             if (model->locus_arr[i].allele_a > model->locus_arr[i].allele_b) {
2782                 for (int j = 0; j < nsmpl; j++)
2783                     ((float *)baf_fmt->p)[sample[j].idx] = 1.0f - ((float *)baf_fmt->p)[sample[j].idx];
2784             }
2785 
2786             for (int j = 0; j < nsmpl; j++)
2787                 if (gts[j] != GT_AB || (is_x_nonpar && sample[j].computed_gender == GENDER_MALE) || is_y_or_mt)
2788                     ((float *)baf_fmt->p)[sample[j].idx] = NAN;
2789         }
2790 
2791         // read line in memory
2792         model->n++;
2793         for (int j = 0; j < nsmpl; j++) {
2794             if (model->flags & WGS_DATA) {
2795                 if (ad0[j] == bcf_int16_missing && ad1[j] == bcf_int16_missing) continue;
2796 
2797                 // site too close to last het site or hom site too close to last site
2798                 if ((pos < last_het_pos[j] + model->min_dst)
2799                     || ((phase_arr[j] == bcf_int8_missing || phase_arr[j] == bcf_int8_vector_end)
2800                         && (pos < last_pos[j] + model->min_dst)))
2801                     continue;
2802 
2803                 // substitute the last hom site with the current het site
2804                 if (pos < last_pos[j] + model->min_dst) sample[j].n--;
2805 
2806                 if (phase_arr[j] != bcf_int8_missing || phase_arr[j] == bcf_int8_vector_end) last_het_pos[j] = pos;
2807                 last_pos[j] = pos;
2808 
2809                 sample[j].n++;
2810                 hts_expand(int, sample[j].n, sample[j].m_vcf_imap, sample[j].vcf_imap_arr);
2811                 hts_expand(int8_t, sample[j].n, sample[j].m_phase, sample[j].phase_arr);
2812                 hts_expand(int16_t, sample[j].n, sample[j].m_data[AD0], sample[j].data_arr[AD0]);
2813                 hts_expand(int16_t, sample[j].n, sample[j].m_data[AD1], sample[j].data_arr[AD1]);
2814                 sample[j].vcf_imap_arr[sample[j].n - 1] = i;
2815                 sample[j].phase_arr[sample[j].n - 1] = phase_arr[j];
2816                 sample[j].data_arr[AD0][sample[j].n - 1] = ad0[j];
2817                 sample[j].data_arr[AD1][sample[j].n - 1] = ad1[j];
2818 
2819             } else {
2820                 float lrr = ((float *)lrr_fmt->p)[sample[j].idx];
2821                 float baf = ((float *)baf_fmt->p)[sample[j].idx];
2822                 if (isnan(lrr) && isnan(baf)) continue;
2823 
2824                 sample[j].n++;
2825                 hts_expand(int, sample[j].n, sample[j].m_vcf_imap, sample[j].vcf_imap_arr);
2826                 hts_expand(int8_t, sample[j].n, sample[j].m_phase, sample[j].phase_arr);
2827                 hts_expand(int16_t, sample[j].n, sample[j].m_data[LRR], sample[j].data_arr[LRR]);
2828                 hts_expand(int16_t, sample[j].n, sample[j].m_data[BAF], sample[j].data_arr[BAF]);
2829                 sample[j].vcf_imap_arr[sample[j].n - 1] = i;
2830                 sample[j].phase_arr[sample[j].n - 1] = phase_arr[j];
2831                 sample[j].data_arr[LRR][sample[j].n - 1] = float_to_int16(lrr);
2832                 sample[j].data_arr[BAF][sample[j].n - 1] = float_to_int16(baf);
2833             }
2834         }
2835     }
2836     free(gts);
2837     free(phase_arr);
2838     free(imap_arr);
2839     free(gt0);
2840     free(gt1);
2841     free(ad0);
2842     free(ad1);
2843     free(last_het_pos);
2844     free(last_pos);
2845 
2846     return i;
2847 }
2848 
read_stats(sample_t * samples,const bcf_hdr_t * hdr,const char * fn,int lrr_gc_order,int flags)2849 static int read_stats(sample_t *samples, const bcf_hdr_t *hdr, const char *fn, int lrr_gc_order, int flags) {
2850     htsFile *fp = hts_open(fn, "r");
2851     if (fp == NULL) error("Could not open %s: %s\n", fn, strerror(errno));
2852 
2853     kstring_t str = {0, 0, NULL};
2854     if (hts_getline(fp, KS_SEP_LINE, &str) <= 0) error("Empty file: %s\n", fn);
2855     tsv_t *tsv = tsv_init_delimiter(str.s, '\t');
2856     sample_t sample;
2857     memset((void *)&sample, 0, sizeof(sample));
2858     if (tsv_register(tsv, "sample_id", tsv_read_sample_id, (void *)&sample.idx) < 0)
2859         error("File %s is missing the sample_id column\n", fn);
2860     if (tsv_register(tsv, "computed_gender", tsv_read_computed_gender, (void *)&sample.computed_gender) < 0)
2861         error("File %s is missing the computed_gender column\n", fn);
2862     if (tsv_register(tsv, "call_rate", tsv_read_float, (void *)&sample.stats.call_rate) < 0)
2863         error("File %s is missing the call_rate column\n", fn);
2864     int lrr_median = tsv_register(tsv, flags & WGS_DATA ? "cov_median" : "lrr_median", tsv_read_float,
2865                                   (void *)&sample.stats.lrr_median);
2866     int lrr_sd =
2867         tsv_register(tsv, flags & WGS_DATA ? "cov_sd" : "lrr_sd", tsv_read_float, (void *)&sample.stats.lrr_sd);
2868     if (tsv_register(tsv, flags & WGS_DATA ? "cov_auto" : "lrr_auto", tsv_read_float, (void *)&sample.stats.lrr_auto)
2869         < 0)
2870         sample.stats.lrr_auto = NAN;
2871     int baf_sd =
2872         tsv_register(tsv, flags & WGS_DATA ? "baf_corr" : "baf_sd", tsv_read_float, (void *)&sample.stats.dispersion);
2873     int baf_conc = tsv_register(tsv, "baf_conc", tsv_read_float, (void *)&sample.stats.baf_conc);
2874     if (tsv_register(tsv, "baf_auto", tsv_read_float, (void *)&sample.stats.baf_auto) < 0) sample.stats.baf_auto = NAN;
2875     tsv_register(tsv, "n_sites", tsv_read_integer, (void *)&sample.n_sites);
2876     tsv_register(tsv, "n_hets", tsv_read_integer, (void *)&sample.n_hets);
2877     tsv_register(tsv, "x_nonpar_n_hets", tsv_read_integer, (void *)&sample.x_nonpar_n_hets);
2878     tsv_register(tsv, "par1_n_hets", tsv_read_integer, (void *)&sample.par1_n_hets);
2879     tsv_register(tsv, "xtr_n_hets", tsv_read_integer, (void *)&sample.xtr_n_hets);
2880     tsv_register(tsv, "par2_n_hets", tsv_read_integer, (void *)&sample.par2_n_hets);
2881     if (tsv_register(tsv, "x_nonpar_baf_corr", tsv_read_float, (void *)&sample.x_nonpar_dispersion) < 0)
2882         sample.x_nonpar_dispersion = NAN;
2883     if (tsv_register(tsv, flags & WGS_DATA ? "x_nonpar_cov_median" : "x_nonpar_lrr_median", tsv_read_float,
2884                      (void *)&sample.x_nonpar_lrr_median)
2885         < 0)
2886         sample.x_nonpar_lrr_median = NAN;
2887     if (tsv_register(tsv, flags & WGS_DATA ? "y_nonpar_cov_median" : "y_nonpar_lrr_median", tsv_read_float,
2888                      (void *)&sample.y_nonpar_lrr_median)
2889         < 0)
2890         sample.y_nonpar_lrr_median = NAN;
2891     if (tsv_register(tsv, flags & WGS_DATA ? "mt_cov_median" : "mt_lrr_median", tsv_read_float,
2892                      (void *)&sample.mt_lrr_median)
2893         < 0)
2894         sample.mt_lrr_median = NAN;
2895     int lrr_gc_rel_ess = tsv_register(tsv, "lrr_gc_rel_ess", tsv_read_float, (void *)&sample.stats.lrr_gc_rel_ess);
2896     if (lrr_sd < 0 || lrr_gc_rel_ess < 0) sample.adjlrr_sd = NAN;
2897     int lrr_gc_coeffs = 0;
2898     for (int i = 0; i <= lrr_gc_order; i++) {
2899         str.l = 0;
2900         ksprintf(&str, "lrr_gc_%d", i);
2901         if (!(tsv_register(tsv, str.s, tsv_read_float, (void *)&sample.stats.coeffs[i]) < 0)) lrr_gc_coeffs++;
2902     }
2903     int i = 0;
2904     while (hts_getline(fp, KS_SEP_LINE, &str) > 0) {
2905         if (!tsv_parse_delimiter(tsv, (bcf1_t *)hdr, str.s, '\t')) {
2906             int idx = sample.idx;
2907             if (idx < 0) continue;
2908             if (flags & WGS_DATA) {
2909                 sample.stats.lrr_sd = sample.stats.lrr_sd / sample.stats.lrr_median;
2910                 sample.stats.lrr_median = logf(sample.stats.lrr_median);
2911                 sample.x_nonpar_lrr_median = logf(sample.x_nonpar_lrr_median);
2912                 sample.y_nonpar_lrr_median = logf(sample.y_nonpar_lrr_median);
2913                 sample.mt_lrr_median = logf(sample.mt_lrr_median);
2914             }
2915             if (!(lrr_sd < 0)) {
2916                 sample.adjlrr_sd = sample.stats.lrr_sd;
2917                 if (!(lrr_gc_rel_ess < 0)) sample.adjlrr_sd *= sqrtf(1.0f - sample.stats.lrr_gc_rel_ess);
2918             }
2919             memcpy((void *)&samples[idx], (const void *)&sample, sizeof(sample_t));
2920             i++;
2921         } else {
2922             error("Could not parse line: %s\n", str.s);
2923         }
2924     }
2925     tsv_destroy(tsv);
2926     free(str.s);
2927     hts_close(fp);
2928     if (i < bcf_hdr_nsamples(hdr)) error("File %s is missing samples\n", fn);
2929     int ret = (lrr_median < 0) || (lrr_sd < 0) || (baf_sd < 0) || (baf_conc < 0) || (lrr_gc_rel_ess < 0)
2930               || (lrr_gc_coeffs != lrr_gc_order + 1);
2931     return ret;
2932 }
2933 
2934 /*********************************
2935  * PLUGIN CODE                   *
2936  *********************************/
2937 
about(void)2938 const char *about(void) { return "Runs SNP-based method for detection of MOsaic CHromosomal Alterations (MoChA).\n"; }
2939 
usage_text(void)2940 static const char *usage_text(void) {
2941     return "\n"
2942            "About:   MOsaic CHromosomal Alterations caller, requires phased genotypes (GT)\n"
2943            "         and either B-allele frequency (BAF) and Log R Ratio intensity (LRR)\n"
2944            "         or allelic depth coverage (AD). (version " MOCHA_VERSION
2945            " https://github.com/freeseek/mocha)\n"
2946            "Usage:   bcftools +mocha [OPTIONS] <in.vcf.gz>\n"
2947            "\n"
2948            "Required options:\n"
2949            "    -g, --genome <assembly>[?]      predefined genome reference rules, 'list' to print available settings, "
2950            "append '?' for details\n"
2951            "    -G, --genome-file <file>        genome reference rules, space/tab-delimited CHROM:FROM-TO,TYPE\n"
2952            "\n"
2953            "General Options:\n"
2954            "    -v, --variants [^]<file>       tabix-indexed [compressed] VCF/BCF file containing variants\n"
2955            "    -f, --apply-filters <list>     require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n"
2956            "                                   to include (or exclude with \"^\" prefix) in the analysis\n"
2957            "    -e, --exclude <expr>           exclude sites for which the expression is true\n"
2958            "    -i, --include <expr>           select sites for which the expression is true\n"
2959            "    -r, --regions <region>         restrict to comma-separated list of regions\n"
2960            "    -R, --regions-file <file>      restrict to regions listed in a file\n"
2961            "    -t, --targets [^]<region>      restrict to comma-separated list of regions. Exclude regions with \"^\" "
2962            "prefix\n"
2963            "    -T, --targets-file [^]<file>   restrict to regions listed in a file. Exclude regions with \"^\" "
2964            "prefix\n"
2965            "    -s, --samples [^]<list>        comma separated list of samples to include (or exclude with \"^\" "
2966            "prefix)\n"
2967            "    -S, --samples-file [^]<file>   file of samples to include (or exclude with \"^\" prefix)\n"
2968            "        --force-samples            only warn about unknown subset samples\n"
2969            "        --input-stats <file>       input samples genome-wide statistics file\n"
2970            "        --only-stats               compute genome-wide statistics without detecting mosaic chromosomal "
2971            "alterations\n"
2972            "    -p  --cnp <file>               list of regions to genotype in BED format\n"
2973            "        --mhc <region>             MHC region to exclude from analysis (will be retained in the output)\n"
2974            "        --kir <region>             KIR region to exclude from analysis (will be retained in the output)\n"
2975            "        --threads <int>            number of extra output compression threads [0]\n"
2976            "\n"
2977            "Output Options:\n"
2978            "    -o, --output <file>            write output to a file [no output]\n"
2979            "    -O, --output-type <b|u|z|v>    b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: "
2980            "uncompressed VCF [v]\n"
2981            "        --no-version               do not append version and command line to the header\n"
2982            "    -a  --no-annotations           omit Ldev and Bdev FORMAT from output VCF (requires --output)\n"
2983            "        --no-log                   suppress progress report on standard error\n"
2984            "    -l  --log <file>               write log to file [standard error]\n"
2985            "    -c, --calls <file>             write chromosomal alterations calls table to a file [standard output]\n"
2986            "    -z  --stats <file>             write samples genome-wide statistics table to a file [no output]\n"
2987            "    -u, --ucsc-bed <file>          write UCSC bed track to a file [no output]\n"
2988            "\n"
2989            "HMM Options:\n"
2990            "        --bdev-LRR-BAF <list>      comma separated list of inverse BAF deviations for LRR+BAF model "
2991            "[" BDEV_LRR_BAF_DFLT
2992            "]\n"
2993            "        --bdev-BAF-phase <list>    comma separated list of inverse BAF deviations for BAF+phase model\n"
2994            "                                   [" BDEV_BAF_PHASE_DFLT
2995            "]\n"
2996            "        --min-dist <int>           minimum base pair distance between consecutive sites for WGS data "
2997            "[" MIN_DST_DFLT
2998            "]\n"
2999            "        --adjust-BAF-LRR <int>     minimum number of genotypes for a cluster to median adjust BAF and LRR "
3000            "(-1 for no adjustment) [" ADJ_BAF_LRR_DFLT
3001            "]\n"
3002            "        --regress-BAF-LRR <int>    minimum number of genotypes for a cluster to regress BAF against LRR "
3003            "(-1 for no regression) [" REGRESS_BAF_LRR_DFLT
3004            "]\n"
3005            "        --LRR-GC-order <int>       order of polynomial to regress LRR against local GC content (-1 for no "
3006            "regression) [" LRR_GC_ORDER_DFLT
3007            "]\n"
3008            "        --xy-major-pl              major transition phred-scaled likelihood [" XY_MAJOR_PL_DFLT
3009            "]\n"
3010            "        --xy-minor-pl              minor transition phred-scaled likelihood [" XY_MINOR_PL_DFLT
3011            "]\n"
3012            "        --auto-tel-pl              autosomal telomeres phred-scaled likelihood [" AUTO_TEL_PL_DFLT
3013            "]\n"
3014            "        --chrX-tel-pl              chromosome X telomeres phred-scaled likelihood [" CHRX_TEL_PL_DFLT
3015            "]\n"
3016            "        --chrY-tel-pl              chromosome Y telomeres phred-scaled likelihood [" CHRY_TEL_PL_DFLT
3017            "]\n"
3018            "        --error-pl                 uniform error phred-scaled likelihood [" ERR_PL_DFLT
3019            "]\n"
3020            "        --flip-pl                  phase flip phred-scaled likelihood [" FLIP_PL_DFLT
3021            "]\n"
3022            "        --short-arm-chrs <list>    list of chromosomes with short arms [" SHORT_ARM_CHRS_DFLT
3023            "]\n"
3024            "        --use-short-arms           use variants in short arms [FALSE]\n"
3025            "        --use-centromeres          use variants in centromeres [FALSE]\n"
3026            "        --use-no-rules-chrs        use chromosomes without centromere rules  [FALSE]\n"
3027            "        --LRR-weight <float>       relative contribution from LRR for LRR+BAF  model [" LRR_BIAS_DFLT
3028            "]\n"
3029            "        --LRR-hap2dip <float>      difference between LRR for haploid and diploid [" LRR_HAP2DIP_DFLT
3030            "]\n"
3031            "        --LRR-cutoff <float>       cutoff between LRR for haploid and diploid used to infer gender "
3032            "[estimated from X nonPAR]\n"
3033            "\n"
3034            "Examples:\n"
3035            "    bcftools +mocha -g GRCh37 -v ^exclude.bcf -p cnps.bed -c calls.tsv -z stats.tsv input.bcf\n"
3036            "    bcftools +mocha -g GRCh38 -o output.bcf -Ob -c calls.tsv -z stats.tsv --LRR-weight 0.5 input.bcf\n"
3037            "\n";
3038 }
3039 
read_list_invf(const char * str,int * n,float min,float max)3040 static float *read_list_invf(const char *str, int *n, float min, float max) {
3041     char *tmp, **list = hts_readlist(str, 0, n);
3042     if (*n >= 128) error("Cannot handle list of 128 or more parameters: %s\n", str);
3043     float *ret = (float *)malloc(*n * sizeof(float));
3044     for (int i = 0; i < *n; i++) {
3045         ret[i] = 1.0f / strtof(list[i], &tmp);
3046         if (*tmp) error("Could not parse: %s\n", list[i]);
3047         if (ret[i] < min || ret[i] > max)
3048             error("Expected values from the interval [%f,%f], found 1/%s\n", min, max, list[i]);
3049         free(list[i]);
3050     }
3051     free(list);
3052     ks_introsort_float((size_t)*n, ret);
3053     return ret;
3054 }
3055 
cnp_parse(const char * line,char ** chr_beg,char ** chr_end,uint32_t * beg,uint32_t * end,void * payload,void * usr)3056 static int cnp_parse(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload,
3057                      void *usr) {
3058     // Use the standard parser for CHROM,FROM,TO
3059     int ret = regidx_parse_bed(line, chr_beg, chr_end, beg, end, NULL, NULL);
3060     if (ret != 0) return ret;
3061 
3062     // Skip the fields that were parsed above
3063     char *ss = (char *)line;
3064     while (*ss && isspace(*ss)) ss++;
3065     for (int i = 0; i < 3; i++) {
3066         while (*ss && !isspace(*ss)) ss++;
3067         if (!*ss) return -2; // wrong number of fields
3068         while (*ss && isspace(*ss)) ss++;
3069     }
3070     if (!*ss) return -2;
3071 
3072     // Parse the payload
3073     int *dat = (int *)payload;
3074     if (strncmp(ss, "DEL", 3) == 0)
3075         *dat = MOCHA_CNP_LOSS;
3076     else if (strncmp(ss, "DUP", 3) == 0)
3077         *dat = MOCHA_CNP_GAIN;
3078     else if (strncmp(ss, "CNV", 3) == 0)
3079         *dat = MOCHA_CNP_CNV;
3080     else
3081         *dat = MOCHA_UNDET;
3082     return 0;
3083 }
3084 
get_file_handle(const char * str)3085 static FILE *get_file_handle(const char *str) {
3086     FILE *ret;
3087     if (strcmp(str, "-") == 0)
3088         ret = stdout;
3089     else {
3090         ret = fopen(str, "w");
3091         if (!ret) error("Failed to open %s: %s\n", str, strerror(errno));
3092     }
3093     return ret;
3094 }
3095 
run(int argc,char * argv[])3096 int run(int argc, char *argv[]) {
3097     beta_binom_null = beta_binom_init();
3098     beta_binom_alt = beta_binom_init();
3099 
3100     // program options
3101     char *tmp = NULL;
3102     int rules_is_file = 0;
3103     int only_stats = 0;
3104     int regions_is_file = 0;
3105     int targets_is_file = 0;
3106     int sample_is_file = 0;
3107     int force_samples = 0;
3108     int output_type = FT_VCF;
3109     int n_threads = 0;
3110     int record_cmd_line = 1;
3111     const char *filter_fname = NULL;
3112     const char *filter_str = NULL;
3113     const char *regions_list = NULL;
3114     const char *targets_list = NULL;
3115     const char *sample_names = NULL;
3116     const char *output_fname = NULL;
3117     const char *cnp_fname = NULL;
3118     const char *stats_fname = NULL;
3119     const char *mhc_reg = NULL;
3120     const char *kir_reg = NULL;
3121     char *rules = NULL;
3122     sample_t *sample = NULL;
3123     FILE *log_file = stderr;
3124     FILE *out_fc = stdout;
3125     FILE *out_fz = NULL;
3126     FILE *out_fu = NULL;
3127     bcf_hdr_t *hdr = NULL;
3128     bcf_hdr_t *out_hdr = NULL;
3129     htsFile *out_fh = NULL;
3130     mocha_table_t mocha_table = {0, 0, NULL};
3131     const char *short_arm_chrs = SHORT_ARM_CHRS_DFLT;
3132     const char *bdev_lrr_baf = BDEV_LRR_BAF_DFLT;
3133     const char *bdev_baf_phase = BDEV_BAF_PHASE_DFLT;
3134 
3135     // model parameters
3136     model_t model;
3137     memset(&model, 0, sizeof(model_t));
3138     model.xy_major_log_prb = -strtof(XY_MAJOR_PL_DFLT, NULL) / 10.0f * M_LN10;
3139     model.xy_minor_log_prb = -strtof(XY_MINOR_PL_DFLT, NULL) / 10.0f * M_LN10;
3140     model.auto_tel_log_prb = -strtof(AUTO_TEL_PL_DFLT, NULL) / 10.0f * M_LN10;
3141     model.chrX_tel_log_prb = -strtof(CHRX_TEL_PL_DFLT, NULL) / 10.0f * M_LN10;
3142     model.chrY_tel_log_prb = -strtof(CHRY_TEL_PL_DFLT, NULL) / 10.0f * M_LN10;
3143     model.err_log_prb = -strtof(ERR_PL_DFLT, NULL) / 10.0f * M_LN10;
3144     model.flip_log_prb = -strtof(FLIP_PL_DFLT, NULL) / 10.0f * M_LN10;
3145     model.min_dst = (int)strtol(MIN_DST_DFLT, NULL, 0);
3146     model.lrr_bias = strtof(LRR_BIAS_DFLT, NULL);
3147     model.lrr_hap2dip = strtof(LRR_HAP2DIP_DFLT, NULL);
3148     model.lrr_cutoff = NAN;
3149     model.adj_baf_lrr = (int)strtol(ADJ_BAF_LRR_DFLT, NULL, 0);
3150     model.regress_baf_lrr = (int)strtol(REGRESS_BAF_LRR_DFLT, NULL, 0);
3151     model.lrr_gc_order = (int)strtol(LRR_GC_ORDER_DFLT, NULL, 0);
3152 
3153     // create synced reader object
3154     bcf_srs_t *sr = bcf_sr_init();
3155     bcf_sr_set_opt(sr, BCF_SR_REQUIRE_IDX);
3156 
3157     static struct option loptions[] = {{"genome", required_argument, NULL, 'g'},
3158                                        {"genome-file", required_argument, NULL, 'G'},
3159                                        {"variants", required_argument, NULL, 'v'},
3160                                        {"apply-filters", required_argument, NULL, 'f'},
3161                                        {"exclude", required_argument, NULL, 'e'},
3162                                        {"include", required_argument, NULL, 'i'},
3163                                        {"regions", required_argument, NULL, 'r'},
3164                                        {"regions-file", required_argument, NULL, 'R'},
3165                                        {"targets", required_argument, NULL, 't'},
3166                                        {"targets-file", required_argument, NULL, 'T'},
3167                                        {"samples", required_argument, NULL, 's'},
3168                                        {"samples-file", required_argument, NULL, 'S'},
3169                                        {"force-samples", no_argument, NULL, 1},
3170                                        {"cnp", required_argument, NULL, 'p'},
3171                                        {"input-stats", required_argument, NULL, 2},
3172                                        {"only-stats", no_argument, NULL, 3},
3173                                        {"mhc", required_argument, NULL, 4},
3174                                        {"kir", required_argument, NULL, 5},
3175                                        {"threads", required_argument, NULL, 9},
3176                                        {"output", required_argument, NULL, 'o'},
3177                                        {"output-type", required_argument, NULL, 'O'},
3178                                        {"no-version", no_argument, NULL, 8},
3179                                        {"no-annotations", no_argument, NULL, 'a'},
3180                                        {"calls", required_argument, NULL, 'c'},
3181                                        {"stats", required_argument, NULL, 'z'},
3182                                        {"ucsc-bed", required_argument, NULL, 'u'},
3183                                        {"log", required_argument, NULL, 'l'},
3184                                        {"no-log", no_argument, NULL, 10},
3185                                        {"bdev-LRR-BAF", required_argument, NULL, 11},
3186                                        {"bdev-BAF-phase", required_argument, NULL, 12},
3187                                        {"min-dist", required_argument, NULL, 13},
3188                                        {"adjust-BAF-LRR", required_argument, NULL, 14},
3189                                        {"regress-BAF-LRR", required_argument, NULL, 15},
3190                                        {"LRR-GC-order", required_argument, NULL, 16},
3191                                        {"xy-major-pl", required_argument, NULL, 17},
3192                                        {"xy-minor-pl", required_argument, NULL, 18},
3193                                        {"auto-tel-pl", required_argument, NULL, 19},
3194                                        {"chrX-tel-pl", required_argument, NULL, 20},
3195                                        {"chrY-tel-pl", required_argument, NULL, 21},
3196                                        {"error-pl", required_argument, NULL, 22},
3197                                        {"flip-pl", required_argument, NULL, 23},
3198                                        {"short-arm-chrs", required_argument, NULL, 24},
3199                                        {"use-short-arms", no_argument, NULL, 25},
3200                                        {"use-centromeres", no_argument, NULL, 26},
3201                                        {"use-no-rules-chrs", no_argument, NULL, 27},
3202                                        {"LRR-weight", required_argument, NULL, 28},
3203                                        {"LRR-hap2dip", required_argument, NULL, 29},
3204                                        {"LRR-cutoff", required_argument, NULL, 30},
3205                                        {NULL, 0, NULL, 0}};
3206     int c;
3207     while ((c = getopt_long(argc, argv, "h?g:G:v:f:e:i:r:R:t:T:s:S:p:o:O:al:c:z:u:", loptions, NULL)) >= 0) {
3208         switch (c) {
3209         case 'g':
3210             rules = optarg;
3211             break;
3212         case 'G':
3213             rules = optarg;
3214             rules_is_file = 1;
3215             break;
3216         case 'v':
3217             if (optarg[0] == '^') {
3218                 filter_fname = optarg + 1;
3219                 model.flags |= FLT_EXCLUDE;
3220             } else {
3221                 filter_fname = optarg;
3222                 model.flags |= FLT_INCLUDE;
3223             }
3224             break;
3225         case 'f':
3226             sr->apply_filters = optarg;
3227             break;
3228         case 'e':
3229             filter_str = optarg;
3230             model.filter_logic |= FLT_EXCLUDE;
3231             break;
3232         case 'i':
3233             filter_str = optarg;
3234             model.filter_logic |= FLT_INCLUDE;
3235             break;
3236         case 'r':
3237             regions_list = optarg;
3238             break;
3239         case 'R':
3240             regions_list = optarg;
3241             regions_is_file = 1;
3242             break;
3243         case 't':
3244             targets_list = optarg;
3245             break;
3246         case 'T':
3247             targets_list = optarg;
3248             targets_is_file = 1;
3249             break;
3250         case 's':
3251             sample_names = optarg;
3252             break;
3253         case 'S':
3254             sample_names = optarg;
3255             sample_is_file = 1;
3256             break;
3257         case 1:
3258             force_samples = 1;
3259             break;
3260         case 2:
3261             stats_fname = optarg;
3262             break;
3263         case 3:
3264             only_stats = 1;
3265             break;
3266         case 'p':
3267             cnp_fname = optarg;
3268             break;
3269         case 4:
3270             mhc_reg = optarg;
3271             break;
3272         case 5:
3273             kir_reg = optarg;
3274             break;
3275         case 9:
3276             n_threads = (int)strtol(optarg, &tmp, 0);
3277             if (*tmp) error("Could not parse: --threads %s\n", optarg);
3278             break;
3279         case 'o':
3280             output_fname = optarg;
3281             break;
3282         case 'O':
3283             switch (optarg[0]) {
3284             case 'b':
3285                 output_type = FT_BCF_GZ;
3286                 break;
3287             case 'u':
3288                 output_type = FT_BCF;
3289                 break;
3290             case 'z':
3291                 output_type = FT_VCF_GZ;
3292                 break;
3293             case 'v':
3294                 output_type = FT_VCF;
3295                 break;
3296             default:
3297                 error("The output type \"%s\" not recognised\n", optarg);
3298             };
3299             break;
3300         case 8:
3301             record_cmd_line = 0;
3302             break;
3303         case 'a':
3304             model.flags |= NO_ANNOT;
3305             break;
3306         case 'l':
3307             log_file = get_file_handle(optarg);
3308             break;
3309         case 10:
3310             model.flags |= NO_LOG;
3311             break;
3312         case 'c':
3313             out_fc = get_file_handle(optarg);
3314             break;
3315         case 'z':
3316             out_fz = get_file_handle(optarg);
3317             break;
3318         case 'u':
3319             out_fu = get_file_handle(optarg);
3320             break;
3321         case 11:
3322             bdev_lrr_baf = optarg;
3323             break;
3324         case 12:
3325             bdev_baf_phase = optarg;
3326             break;
3327         case 13:
3328             model.min_dst = (int)strtol(optarg, &tmp, 0);
3329             if (*tmp) error("Could not parse: --min-dist %s\n", optarg);
3330             break;
3331         case 14:
3332             model.adj_baf_lrr = (int)strtol(optarg, &tmp, 0);
3333             if (*tmp) error("Could not parse: --adjust-BAF-LRR %s\n", optarg);
3334             break;
3335         case 15:
3336             model.regress_baf_lrr = (int)strtol(optarg, &tmp, 0);
3337             if (*tmp) error("Could not parse: --regress-BAF-LRR %s\n", optarg);
3338             break;
3339         case 16:
3340             model.lrr_gc_order = (int)strtol(optarg, &tmp, 0);
3341             if (*tmp) error("Could not parse: --LRR-GC-order %s\n", optarg);
3342             break;
3343         case 17:
3344             model.xy_major_log_prb = -strtof(optarg, &tmp) / 10.0f * M_LN10;
3345             if (*tmp) error("Could not parse: --xy-major-pl %s\n", optarg);
3346             break;
3347         case 18:
3348             model.xy_minor_log_prb = -strtof(optarg, &tmp) / 10.0f * M_LN10;
3349             if (*tmp) error("Could not parse: --xy-minor-pl %s\n", optarg);
3350             break;
3351         case 19:
3352             model.auto_tel_log_prb = -strtof(optarg, &tmp) / 10.0f * M_LN10;
3353             if (*tmp) error("Could not parse: --auto-tel-pl %s\n", optarg);
3354             break;
3355         case 20:
3356             model.chrX_tel_log_prb = -strtof(optarg, &tmp) / 10.0f * M_LN10;
3357             if (*tmp) error("Could not parse: --chrX-tel-pl %s\n", optarg);
3358             break;
3359         case 21:
3360             model.chrY_tel_log_prb = -strtof(optarg, &tmp) / 10.0f * M_LN10;
3361             if (*tmp) error("Could not parse: --chrY-tel-pl %s\n", optarg);
3362             break;
3363         case 22:
3364             model.err_log_prb = -strtof(optarg, &tmp) / 10.0f * M_LN10;
3365             if (*tmp) error("Could not parse: --error-pl %s\n", optarg);
3366             break;
3367         case 23:
3368             model.flip_log_prb = -strtof(optarg, &tmp) / 10.0f * M_LN10;
3369             if (*tmp) error("Could not parse: --flip-pl %s\n", optarg);
3370             break;
3371         case 24:
3372             short_arm_chrs = optarg;
3373             break;
3374         case 25:
3375             model.flags |= USE_SHORT_ARMS;
3376             break;
3377         case 26:
3378             model.flags |= USE_CENTROMERES;
3379             break;
3380         case 27:
3381             model.flags |= USE_NO_RULES_CHRS;
3382             break;
3383         case 28:
3384             model.lrr_bias = strtof(optarg, &tmp);
3385             if (*tmp) error("Could not parse: --LRR-weight %s\n", optarg);
3386             break;
3387         case 29:
3388             model.lrr_hap2dip = strtof(optarg, &tmp);
3389             if (*tmp) error("Could not parse: --LRR-hap2dip %s\n", optarg);
3390             break;
3391         case 30:
3392             model.lrr_cutoff = strtof(optarg, &tmp);
3393             if (*tmp) error("Could not parse: --LRR-cutoff %s\n", optarg);
3394             break;
3395         case 'h':
3396         case '?':
3397             error("%s", usage_text());
3398             break;
3399         default:
3400             error("Unknown argument: %s\n", optarg);
3401         }
3402     }
3403 
3404     if (!rules) {
3405         fprintf(log_file, "Genome reference assembly was not specified with --genome or --genome-file\n");
3406         error("%s", usage_text());
3407     }
3408     int len = strlen(rules);
3409     if (!rules_is_file && (strncmp(rules, "list", 4) == 0 || rules[len - 1] == '?'))
3410         genome_init_alias(log_file, rules, NULL);
3411 
3412     if (model.filter_logic == (FLT_EXCLUDE | FLT_INCLUDE)) error("Only one of --include or --exclude can be given.\n");
3413 
3414     if (only_stats) {
3415         if (!out_fz) {
3416             fprintf(log_file, "Option --only-stats requires output option --stats\n");
3417             error("%s", usage_text());
3418         }
3419         if (output_fname || out_fc != stdout || out_fu) {
3420             fprintf(log_file, "Cannot use --only-stats with output options --output, --calls, or --ucsc-bed\n");
3421             error("%s", usage_text());
3422         }
3423     }
3424 
3425     if (output_fname == NULL && (model.flags & NO_ANNOT)) {
3426         fprintf(log_file, "Option --no-annotations requires option --output\n");
3427         error("%s", usage_text());
3428     }
3429 
3430     if (model.lrr_gc_order > MAX_ORDER) {
3431         fprintf(log_file, "Polynomial order must not be greater than %d: --LRR-GC-order %d\n", MAX_ORDER,
3432                 model.lrr_gc_order);
3433         error("%s", usage_text());
3434     }
3435 
3436     if (model.xy_major_log_prb > model.xy_minor_log_prb) {
3437         fprintf(log_file,
3438                 "Major transition phred-scaled likelihood should be greater than minor transition phred-scaled "
3439                 "likelihood: --xy-major-pl %.2f --xy-minor-pl %.2f\n",
3440                 -10.0f * M_LOG10E * model.xy_major_log_prb, -10.0f * M_LOG10E * model.xy_minor_log_prb);
3441         error("%s", usage_text());
3442     }
3443 
3444     if (model.xy_minor_log_prb > model.auto_tel_log_prb) {
3445         fprintf(log_file,
3446                 "Minor transition phred-scaled likelihood should be greater than autosomal telomeres phred-scaled "
3447                 "likelihood: --xy-minor-pl %.2f --auto-tel-pl %.2f\n",
3448                 -10.0f * M_LOG10E * model.xy_minor_log_prb, -10.0f * M_LOG10E * model.auto_tel_log_prb);
3449         error("%s", usage_text());
3450     }
3451 
3452     if (model.xy_minor_log_prb > model.chrX_tel_log_prb) {
3453         fprintf(log_file,
3454                 "Minor transition phred-scaled likelihood should be greater than chromosome X telomeres phred-scaled "
3455                 "likelihood: --xy-minor-pl %.2f --chrX-tel-pl %.2f\n",
3456                 -10.0f * M_LOG10E * model.xy_minor_log_prb, -10.0f * M_LOG10E * model.chrX_tel_log_prb);
3457         error("%s", usage_text());
3458     }
3459 
3460     if (model.xy_minor_log_prb > model.chrY_tel_log_prb) {
3461         fprintf(log_file,
3462                 "Minor transition phred-scaled likelihood should be greater than chromosome Y telomeres phred-scaled "
3463                 "likelihood: --xy-minor-pl %.2f --chrY-tel-pl %.2f\n",
3464                 -10.0f * M_LOG10E * model.xy_minor_log_prb, -10.0f * M_LOG10E * model.chrY_tel_log_prb);
3465         error("%s", usage_text());
3466     }
3467 
3468     if (stats_fname && !isnan(model.lrr_cutoff)) {
3469         fprintf(log_file, "Cannot use options --input-stats and --LRR-cutoff together\n");
3470         error("%s", usage_text());
3471     }
3472 
3473     // parse parameters defining hidden states
3474     model.bdev_lrr_baf = read_list_invf(bdev_lrr_baf, &model.bdev_lrr_baf_n, -0.5f, 0.25f);
3475     model.bdev_baf_phase = read_list_invf(bdev_baf_phase, &model.bdev_baf_phase_n, 0.0f, 0.5f);
3476 
3477     // read list of regions to genotype
3478     if (cnp_fname) {
3479         model.cnp_idx = regidx_init(cnp_fname, cnp_parse, NULL, sizeof(int), NULL);
3480         if (!model.cnp_idx) error("Error: failed to initialize CNP regions: --cnp %s\n", cnp_fname);
3481         model.cnp_itr = regitr_init(model.cnp_idx);
3482     }
3483     if (mhc_reg) model.mhc_idx = regidx_init_string(mhc_reg, regidx_parse_reg, NULL, 0, NULL);
3484     if (kir_reg) model.kir_idx = regidx_init_string(kir_reg, regidx_parse_reg, NULL, 0, NULL);
3485 
3486     // input VCF
3487     char *input_fname = NULL;
3488     if (optind >= argc) {
3489         if (!isatty(fileno((FILE *)stdin))) input_fname = "-";
3490     } else
3491         input_fname = argv[optind];
3492     if (!input_fname) error("%s", usage_text());
3493 
3494     // read in the regions from the command line
3495     if (regions_list) {
3496         if (bcf_sr_set_regions(sr, regions_list, regions_is_file) < 0)
3497             error("Failed to read the regions: %s\n", regions_list);
3498     }
3499     if (targets_list) {
3500         if (bcf_sr_set_targets(sr, targets_list, targets_is_file, 0) < 0)
3501             error("Failed to read the targets: %s\n", targets_list);
3502     }
3503 
3504     if (bcf_sr_set_threads(sr, n_threads) < 0) error("Failed to create threads\n");
3505     if (!bcf_sr_add_reader(sr, input_fname)) error("Failed to open %s: %s\n", input_fname, bcf_sr_strerror(sr->errnum));
3506     if (filter_fname)
3507         if (!bcf_sr_add_reader(sr, filter_fname))
3508             error("Failed to open %s: %s\n", filter_fname, bcf_sr_strerror(sr->errnum));
3509 
3510     // check whether the necessary information has been included in the VCF
3511     hdr = bcf_sr_get_header(sr, 0);
3512     if (bcf_hdr_nsamples(hdr) == 0) error("Error: input VCF file has no samples\n");
3513     if (filter_str) model.filter = filter_init(hdr, filter_str);
3514 
3515     model.allele_a_id = bcf_hdr_id2int(hdr, BCF_DT_ID, "ALLELE_A");
3516     if (!bcf_hdr_idinfo_exists(hdr, BCF_HL_INFO, model.allele_a_id)) model.allele_a_id = -1;
3517     if (model.allele_a_id >= 0 && bcf_hdr_id2type(hdr, BCF_HL_INFO, model.allele_a_id) != BCF_HT_INT)
3518         error("Error: input VCF file ALLELE_A info field is not of integer type\n");
3519 
3520     model.allele_b_id = bcf_hdr_id2int(hdr, BCF_DT_ID, "ALLELE_B");
3521     if (!bcf_hdr_idinfo_exists(hdr, BCF_HL_INFO, model.allele_a_id)) model.allele_b_id = -1;
3522     if (model.allele_b_id >= 0 && bcf_hdr_id2type(hdr, BCF_HL_INFO, model.allele_b_id) != BCF_HT_INT)
3523         error("Error: input VCF file ALLELE_B info field is not of float type\n");
3524 
3525     model.gc_id = bcf_hdr_id2int(hdr, BCF_DT_ID, "GC");
3526     if (!bcf_hdr_idinfo_exists(hdr, BCF_HL_INFO, model.gc_id)) model.gc_id = -1;
3527     if (model.lrr_gc_order > 0 && model.gc_id < 0)
3528         error(
3529             "Error: input VCF has no GC info field: use \"--LRR-GC-order 0/-1\" to disable LRR adjustment through GC "
3530             "correction\nor use bcftools +mochatools -- -t GC to infer GC content\n");
3531 
3532     model.gt_id = bcf_hdr_id2int(hdr, BCF_DT_ID, "GT");
3533     if (!bcf_hdr_idinfo_exists(hdr, BCF_HL_FMT, model.gt_id)) error("Error: input VCF file has no GT format field\n");
3534 
3535     model.ad_id = bcf_hdr_id2int(hdr, BCF_DT_ID, "AD");
3536     if (!bcf_hdr_idinfo_exists(hdr, BCF_HL_FMT, model.ad_id)) model.ad_id = -1;
3537     if (model.ad_id >= 0 && bcf_hdr_id2type(hdr, BCF_HL_FMT, model.ad_id) != BCF_HT_INT)
3538         error("Error: input VCF file AD format field is not of integer type\n");
3539     if (model.ad_id >= 0 && bcf_hdr_id2length(hdr, BCF_HL_FMT, model.ad_id) != BCF_VL_R)
3540         error("Error: input VCF file AD format field has wrong number of values\n");
3541 
3542     model.baf_id = bcf_hdr_id2int(hdr, BCF_DT_ID, "BAF");
3543     if (!bcf_hdr_idinfo_exists(hdr, BCF_HL_FMT, model.baf_id)) model.baf_id = -1;
3544     if (model.baf_id >= 0 && bcf_hdr_id2type(hdr, BCF_HL_FMT, model.baf_id) != BCF_HT_REAL)
3545         error("Error: input VCF file BAF format field is not of float type\n");
3546 
3547     model.lrr_id = bcf_hdr_id2int(hdr, BCF_DT_ID, "LRR");
3548     if (!bcf_hdr_idinfo_exists(hdr, BCF_HL_FMT, model.lrr_id)) model.lrr_id = -1;
3549     if (model.lrr_id >= 0 && bcf_hdr_id2type(hdr, BCF_HL_FMT, model.lrr_id) != BCF_HT_REAL)
3550         error("Error: input VCF file LRR format field is not of float type\n");
3551 
3552     if (model.ad_id >= 0) {
3553         model.flags |= WGS_DATA;
3554         model.lrr_hap2dip = (float)M_LN2;
3555     } else if (model.baf_id >= 0 && model.lrr_id >= 0) {
3556         if (model.allele_a_id < 0 || model.allele_b_id < 0)
3557             error(
3558                 "Error: input VCF file has no ALLELE_A or ALLELE_B info fields\nUse bcftools +mochatools -- -t "
3559                 "ALLELE_A,ALLELE_B to infer ALLELE_A and ALLELE_B\n");
3560     } else {
3561         error("Error: input VCF file must contain either the AD format field or the BAF and LRR format fields\n");
3562     }
3563 
3564     // median adjustment is necessary with sequencing counts data
3565     if ((model.lrr_gc_order < 0) && (model.flags & WGS_DATA)) model.lrr_gc_order = 0;
3566 
3567     // beginning of plugin run
3568     fprintf(log_file, "MoChA " MOCHA_VERSION " https://github.com/freeseek/mocha\n");
3569     fprintf(log_file, "Genome reference: %s\n", rules);
3570     if (sample_names) fprintf(log_file, "Samples: %s\n", sample_names);
3571     if (targets_list) fprintf(log_file, "Targets: %s\n", targets_list);
3572     if (sr->apply_filters) fprintf(log_file, "Filters: %s\n", sr->apply_filters);
3573     if (model.flags & FLT_EXCLUDE) fprintf(log_file, "Variants to exclude: %s\n", filter_fname);
3574     if (model.flags & FLT_INCLUDE) fprintf(log_file, "Variants to include: %s\n", filter_fname);
3575     if (cnp_fname) fprintf(log_file, "Regions to genotype: %s\n", cnp_fname);
3576     if (stats_fname) fprintf(log_file, "Genome-wide statistics: %s\n", stats_fname);
3577     if (mhc_reg) fprintf(log_file, "MHC region to exclude from analysis: %s\n", mhc_reg);
3578     if (kir_reg) fprintf(log_file, "KIR region to exclude from analysis: %s\n", kir_reg);
3579     fprintf(log_file, "BAF deviations for LRR+BAF model: %s\n", bdev_lrr_baf);
3580     fprintf(log_file, "BAF deviations for BAF+phase model: %s\n", bdev_baf_phase);
3581     if (model.flags & WGS_DATA) {
3582         fprintf(log_file, "Minimum base pair distance between consecutive sites: %d\n", model.min_dst);
3583     } else {
3584         fprintf(log_file, "Minimum number of genotypes for a cluster to median adjust BAF and LRR: %d\n",
3585                 model.adj_baf_lrr);
3586         fprintf(log_file, "Minimum number of genotypes for a cluster to regress BAF against LRR: %d\n",
3587                 model.regress_baf_lrr);
3588     }
3589     fprintf(log_file, "Order of polynomial in local GC content to be used to regress LRR against GC: %d\n",
3590             model.lrr_gc_order);
3591     fprintf(log_file, "Major transition phred-scaled likelihood: %.2f\n", -10.0f * M_LOG10E * model.xy_major_log_prb);
3592     fprintf(log_file, "Minor transition phred-scaled likelihood: %.2f\n", -10.0f * M_LOG10E * model.xy_minor_log_prb);
3593     fprintf(log_file, "Autosomal telomeres phred-scaled likelihood: %.2f\n",
3594             -10.0f * M_LOG10E * model.auto_tel_log_prb);
3595     fprintf(log_file, "Chromosome X telomeres phred-scaled likelihood: %.2f\n",
3596             -10.0f * M_LOG10E * model.chrX_tel_log_prb);
3597     fprintf(log_file, "Chromosome Y telomeres phred-scaled likelihood: %.2f\n",
3598             -10.0f * M_LOG10E * model.chrY_tel_log_prb);
3599     fprintf(log_file, "Uniform error phred-scaled likelihood: %.2f\n", -10.0f * M_LOG10E * model.err_log_prb);
3600     fprintf(log_file, "Phase flip phred-scaled likelihood: %.2f\n", -10.0f * M_LOG10E * model.flip_log_prb);
3601     fprintf(log_file, "List of short arms: %s\n", short_arm_chrs);
3602     fprintf(log_file, "Use variants in short arms: %s\n", model.flags & USE_SHORT_ARMS ? "TRUE" : "FALSE");
3603     fprintf(log_file, "Use variants in centromeres: %s\n", model.flags & USE_CENTROMERES ? "TRUE" : "FALSE");
3604     fprintf(log_file, "Use chromosomes without centromere rules: %s\n",
3605             model.flags & USE_NO_RULES_CHRS ? "TRUE" : "FALSE");
3606     fprintf(log_file, "Relative contribution from LRR for LRR+BAF model: %.2f\n", model.lrr_bias);
3607     fprintf(log_file, "Difference between LRR for haploid and diploid: %.2f\n", model.lrr_hap2dip);
3608 
3609     // initialize genome parameters
3610     if (rules_is_file)
3611         model.genome_rules = genome_init_file(rules, hdr);
3612     else
3613         model.genome_rules = genome_init_alias(log_file, rules, hdr);
3614     if (!(model.flags & NO_LOG)) fprintf(log_file, "Using genome assembly from %s\n", rules);
3615     readlist_short_arms(model.genome_rules, short_arm_chrs, hdr);
3616 
3617     // subset VCF file
3618     if (sample_names) {
3619         int ret = bcf_hdr_set_samples(hdr, sample_names, sample_is_file);
3620         if (ret < 0)
3621             error("Error parsing the sample list\n");
3622         else if (ret > 0) {
3623             if (force_samples)
3624                 fprintf(log_file, "Warn: sample #%d not found in the header... skipping\n", ret);
3625             else
3626                 error(
3627                     "Error: sample #%d not found in the header. Use \"--force-samples\" to "
3628                     "ignore this error\n",
3629                     ret);
3630         }
3631         if (bcf_hdr_nsamples(hdr) == 0) error("Error: subsetting has removed all samples\n");
3632     }
3633 
3634     // output VCF
3635     if (output_fname) {
3636         out_fh = hts_open(output_fname, hts_bcf_wmode(output_type));
3637         if (out_fh == NULL) error("Cannot write to \"%s\": %s\n", output_fname, strerror(errno));
3638         if (n_threads) hts_set_opt(out_fh, HTS_OPT_THREAD_POOL, sr->p);
3639         out_hdr = print_hdr(out_fh, hdr, argc, argv, record_cmd_line, model.flags);
3640     }
3641 
3642     int nsmpl = bcf_hdr_nsamples(hdr);
3643     if (nsmpl == 0) error("No samples in the VCF?\n");
3644     if (!(model.flags & NO_LOG)) fprintf(log_file, "Loading %d sample(s) from the VCF file\n", nsmpl);
3645     if (!(model.flags & WGS_DATA)) {
3646         if (model.regress_baf_lrr != -1 && (model.adj_baf_lrr == -1 || model.regress_baf_lrr < model.adj_baf_lrr))
3647             error(
3648                 "Error: median adjustment must be performed whenever regression adjustment is "
3649                 "performed\n--adjust-BAF-LRR %d --regress-BAF-LRR %d\n",
3650                 model.adj_baf_lrr, model.regress_baf_lrr);
3651         if (nsmpl < 3 * model.adj_baf_lrr)
3652             fprintf(log_file,
3653                     "Warning: --adjust-BAF-LRR %d median adjustment will not be effective with "
3654                     "only %d sample(s)\n",
3655                     model.adj_baf_lrr, nsmpl);
3656         if (nsmpl < 3 * model.regress_baf_lrr)
3657             fprintf(log_file,
3658                     "Warning: --regress-BAF-LRR %d regression will not be effective with only "
3659                     "%d sample(s)\n",
3660                     model.regress_baf_lrr, nsmpl);
3661     }
3662 
3663     sample = (sample_t *)calloc(nsmpl, sizeof(sample_t));
3664     for (int i = 0; i < nsmpl; i++) {
3665         sample[i].idx = i;
3666         sample[i].x_nonpar_lrr_median = NAN;
3667         sample[i].y_nonpar_lrr_median = NAN;
3668         sample[i].mt_lrr_median = NAN;
3669     }
3670     if (stats_fname ? read_stats(sample, hdr, stats_fname, model.lrr_gc_order, model.flags) : 1) {
3671         for (int rid = 0; rid < hdr->n[BCF_DT_CTG]; rid++) {
3672             model.rid = rid;
3673             int nret = get_contig(sr, sample, &model);
3674             if (model.n <= 0) continue;
3675             if (!(model.flags & NO_LOG))
3676                 fprintf(log_file, "Using %d out of %d read variants from contig %s\n", model.n, nret,
3677                         bcf_hdr_id2name(hdr, rid));
3678             if (model.genome_rules->length[rid] < model.locus_arr[model.n - 1].pos)
3679                 model.genome_rules->length[rid] = model.locus_arr[model.n - 1].pos;
3680             for (int j = 0; j < nsmpl; j++) sample_stats(sample + j, &model);
3681         }
3682 
3683         sample_summary(sample, nsmpl, &model, stats_fname == NULL);
3684     }
3685 
3686     int cnt[3] = {0, 0, 0};
3687     for (int i = 0; i < nsmpl; i++) cnt[sample[i].computed_gender]++;
3688     if (!(model.flags & NO_LOG))
3689         fprintf(log_file, "Estimated %d sample(s) of unknown gender, %d male(s) and %d female(s)\n",
3690                 cnt[GENDER_UNKNOWN], cnt[GENDER_MALE], cnt[GENDER_FEMALE]);
3691     mocha_print_stats(out_fz, sample, nsmpl, model.lrr_gc_order, hdr, model.flags);
3692 
3693     if (!only_stats) {
3694         for (int rid = 0; rid < hdr->n[BCF_DT_CTG]; rid++) {
3695             model.rid = rid;
3696             int nret = get_contig(sr, sample, &model);
3697             if (model.n <= 0) continue;
3698             if (!(model.flags & NO_LOG))
3699                 fprintf(log_file, "Using %d out of %d read variants from contig %s\n", model.n, nret,
3700                         bcf_hdr_id2name(hdr, rid));
3701             for (int j = 0; j < nsmpl; j++) {
3702                 if (model.cnp_idx)
3703                     regidx_overlap(model.cnp_idx, bcf_hdr_id2name(hdr, rid), 0, model.genome_rules->length[rid],
3704                                    model.cnp_itr);
3705                 sample_run(sample + j, &mocha_table, &model);
3706             }
3707 
3708             if (output_fname) {
3709                 nret = put_contig(sr, sample, &model, out_fh, out_hdr);
3710                 if (!(model.flags & NO_LOG))
3711                     fprintf(log_file, "Written %d variants for contig %s\n", nret, bcf_hdr_id2name(hdr, rid));
3712             }
3713         }
3714 
3715         // estimate LRR at common autosomal losses and gains
3716         if (!(model.flags & NO_LOG) && model.cnp_itr) {
3717             int n_cnp_loss = 0, n_cnp_gain = 0, n_rare_gain = 0, n_rare_loss = 0;
3718             float *cnp_ldev = (float *)malloc(mocha_table.n * sizeof(float));
3719             float *rare_ldev = (float *)malloc(mocha_table.n * sizeof(float));
3720             for (int i = 0; i < mocha_table.n; i++) {
3721                 if (mocha_table.a[i].rid == model.genome_rules->x_rid) continue;
3722                 if (mocha_table.a[i].type == MOCHA_CNP_LOSS && mocha_table.a[i].n_hets == 0)
3723                     cnp_ldev[n_cnp_loss++] = mocha_table.a[i].ldev;
3724                 else if (mocha_table.a[i].type == MOCHA_CNP_GAIN && mocha_table.a[i].n_hets >= 5
3725                          && mocha_table.a[i].bdev >= 0.1f)
3726                     cnp_ldev[mocha_table.n - (++n_cnp_gain)] = mocha_table.a[i].ldev;
3727                 else if (mocha_table.a[i].type == MOCHA_LOSS && mocha_table.a[i].n_hets == 0)
3728                     rare_ldev[n_rare_loss++] = mocha_table.a[i].ldev;
3729                 else if (mocha_table.a[i].type == MOCHA_GAIN && mocha_table.a[i].n_hets >= 5
3730                          && mocha_table.a[i].bdev >= 0.1f)
3731                     rare_ldev[mocha_table.n - (++n_rare_gain)] = mocha_table.a[i].ldev;
3732             }
3733             float cnp_loss_ldev = get_median(cnp_ldev, n_cnp_loss, NULL);
3734             float cnp_gain_ldev = get_median(&cnp_ldev[mocha_table.n - n_cnp_gain], n_cnp_gain, NULL);
3735             float rare_loss_ldev = get_median(rare_ldev, n_rare_loss, NULL);
3736             float rare_gain_ldev = get_median(&rare_ldev[mocha_table.n - n_rare_gain], n_rare_gain, NULL);
3737             fprintf(log_file, "Adjusted LRR at CNP losses: %.4f\n", cnp_loss_ldev);
3738             fprintf(log_file, "Adjusted LRR at CNP gains: %.4f\n", cnp_gain_ldev);
3739             fprintf(log_file, "Adjusted LRR at rare losses: %.4f\n", rare_loss_ldev);
3740             fprintf(log_file, "Adjusted LRR at rare gains: %.4f\n", rare_gain_ldev);
3741             free(cnp_ldev);
3742             free(rare_ldev);
3743         }
3744     }
3745 
3746     // clear sample data
3747     for (int j = 0; j < nsmpl; j++) {
3748         free(sample[j].vcf_imap_arr);
3749         free(sample[j].data_arr[BDEV]);
3750         free(sample[j].data_arr[LDEV]);
3751         free(sample[j].phase_arr);
3752     }
3753 
3754     // clear model data
3755     free(model.locus_arr);
3756     free(model.gc_arr);
3757     free(model.bdev_lrr_baf);
3758     free(model.bdev_baf_phase);
3759     genome_destroy(model.genome_rules);
3760 
3761     // free precomputed tables
3762     ad_to_lrr_baf(NULL, NULL, NULL, NULL, 0);
3763 
3764     // write table with mosaic chromosomal alterations (and UCSC bed track)
3765     if (!only_stats) {
3766         mocha_print_calls(out_fc, mocha_table.a, mocha_table.n, hdr, model.flags, rules, model.lrr_hap2dip);
3767         mocha_print_ucsc(out_fu, mocha_table.a, mocha_table.n, hdr);
3768     }
3769     free(mocha_table.a);
3770 
3771     // close output VCF
3772     if (output_fname) {
3773         bcf_hdr_destroy(out_hdr);
3774         hts_close(out_fh);
3775     }
3776 
3777     if (log_file != stdout && log_file != stderr) fclose(log_file);
3778 
3779     // clean up
3780     if (model.filter) filter_destroy(model.filter);
3781     if (model.cnp_idx) regidx_destroy(model.cnp_idx);
3782     if (model.cnp_itr) regitr_destroy(model.cnp_itr);
3783     bcf_sr_destroy(sr);
3784     free(sample);
3785 
3786     beta_binom_destroy(beta_binom_null);
3787     beta_binom_destroy(beta_binom_alt);
3788     return 0;
3789 }
3790