1 /* The MIT License
2 
3    Copyright (C) 2018-2021 Giulio Genovese
4 
5    Author: Giulio Genovese <giulio.genovese@gmail.com>
6 
7    Permission is hereby granted, free of charge, to any person obtaining a copy
8    of this software and associated documentation files (the "Software"), to deal
9    in the Software without restriction, including without limitation the rights
10    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11    copies of the Software, and to permit persons to whom the Software is
12    furnished to do so, subject to the following conditions:
13 
14    The above copyright notice and this permission notice shall be included in
15    all copies or substantial portions of the Software.
16 
17    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23    THE SOFTWARE.
24 
25  */
26 
27 /*
28     Computes beta-binomial likelihoods using recursively pre-calculated tables
29 */
30 
31 #ifndef __MOCHA_H__
32 #define __MOCHA_H__
33 
34 #include <htslib/kseq.h>
35 #include <htslib/ksort.h>
36 #include <htslib/vcf.h>
37 #include "tsv2vcf.h"
38 #include "bcftools.h"
39 
40 #define GENDER_UNKNOWN 0
41 #define GENDER_MALE 1
42 #define GENDER_FEMALE 2
43 
44 /****************************************
45  * TSV FUNCTIONS                        *
46  ****************************************/
47 
48 // adapted from Petr Danecek's implementation of tsv_init() in bcftools/tsv2vcf.c
tsv_init_delimiter(const char * str,char delimiter)49 static inline tsv_t *tsv_init_delimiter(const char *str, char delimiter) {
50     tsv_t *tsv = (tsv_t *)calloc(1, sizeof(tsv_t));
51     kstring_t tmp = {0, 0, 0};
52     const char *ss = str, *se = ss;
53     tsv->ncols = 0;
54     while (*ss) {
55         if (delimiter == '\0')
56             while (*se && !isspace(*se)) se++;
57         else
58             while (*se && *se != delimiter) se++;
59         tsv->ncols++;
60         tsv->cols = (tsv_col_t *)realloc(tsv->cols, sizeof(tsv_col_t) * tsv->ncols);
61         tsv->cols[tsv->ncols - 1].name = NULL;
62         tsv->cols[tsv->ncols - 1].setter = NULL;
63         tmp.l = 0;
64         kputsn(ss, se - ss, &tmp);
65         if (strcasecmp("-", tmp.s)) tsv->cols[tsv->ncols - 1].name = strdup(tmp.s);
66         if (!*se) break;
67         se++;
68         if (delimiter == '\0')
69             while (*se && isspace(*se)) se++;
70         ss = se;
71     }
72     free(tmp.s);
73     return tsv;
74 }
75 
tsv_parse_delimiter(tsv_t * tsv,bcf1_t * rec,char * str,char delimiter)76 static inline int tsv_parse_delimiter(tsv_t *tsv, bcf1_t *rec, char *str, char delimiter) {
77     int status = 0;
78     tsv->icol = 0;
79     tsv->ss = tsv->se = str;
80     while (*tsv->ss && tsv->icol < tsv->ncols) {
81         if (delimiter == '\0')
82             while (*tsv->se && !isspace(*tsv->se)) tsv->se++;
83         else
84             while (*tsv->se && *tsv->se != delimiter) tsv->se++;
85         if (tsv->cols[tsv->icol].setter) {
86             int ret = tsv->cols[tsv->icol].setter(tsv, rec, tsv->cols[tsv->icol].usr);
87             if (ret < 0) return -1;
88             status++;
89         }
90         if (*tsv->se) {
91             tsv->se++;
92             if (delimiter == '\0')
93                 while (*tsv->se && isspace(*tsv->se)) tsv->se++;
94         }
95         tsv->ss = tsv->se;
96         tsv->icol++;
97     }
98     return status ? 0 : -1;
99 }
100 
tsv_read_float(tsv_t * tsv,bcf1_t * rec,void * usr)101 int tsv_read_float(tsv_t *tsv, bcf1_t *rec, void *usr) {
102     float *single = (float *)usr;
103     char tmp = *tsv->se;
104     *tsv->se = 0;
105     char *endptr;
106     *single = (float)strtof(tsv->ss, &endptr);
107     *tsv->se = tmp;
108     return 0;
109 }
110 
tsv_read_integer(tsv_t * tsv,bcf1_t * rec,void * usr)111 int tsv_read_integer(tsv_t *tsv, bcf1_t *rec, void *usr) {
112     int *integer = (int *)usr;
113     char tmp = *tsv->se;
114     *tsv->se = 0;
115     char *endptr;
116     *integer = (int)strtol(tsv->ss, &endptr, 0);
117     if (*endptr) error("Could not parse integer %s\n", tsv->ss);
118     *tsv->se = tmp;
119     return 0;
120 }
121 
tsv_read_string(tsv_t * tsv,bcf1_t * rec,void * usr)122 int tsv_read_string(tsv_t *tsv, bcf1_t *rec, void *usr) {
123     char **str = (char **)usr;
124     if (tsv->se == tsv->ss) {
125         *str = NULL;
126     } else {
127         char tmp = *tsv->se;
128         *tsv->se = 0;
129         *str = strdup(tsv->ss);
130         *tsv->se = tmp;
131     }
132     return 0;
133 }
134 
tsv_read_sample_id(tsv_t * tsv,bcf1_t * rec,void * usr)135 int tsv_read_sample_id(tsv_t *tsv, bcf1_t *rec, void *usr) {
136     bcf_hdr_t *hdr = (bcf_hdr_t *)rec;
137     int *idx = (int *)usr;
138     char tmp = *tsv->se;
139     *tsv->se = 0;
140     *idx = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, tsv->ss);
141     *tsv->se = tmp;
142     return 0;
143 }
144 
tsv_read_computed_gender(tsv_t * tsv,bcf1_t * rec,void * usr)145 int tsv_read_computed_gender(tsv_t *tsv, bcf1_t *rec, void *usr) {
146     int *computed_gender = (int *)usr;
147     if (toupper(*tsv->ss) == 'M') {
148         *computed_gender = GENDER_MALE;
149     } else if (toupper(*tsv->ss) == 'F') {
150         *computed_gender = GENDER_FEMALE;
151     } else if (toupper(*tsv->ss) == 'U' || toupper(*tsv->ss) == 'N') {
152         *computed_gender = GENDER_UNKNOWN;
153     } else {
154         char *endptr;
155         *computed_gender = (int)strtol(tsv->ss, &endptr, 0);
156         if (*endptr)
157             error("Could not parse gender %s\n(Acceptable values: 1/M/m = male, 2/F/f = female, 0/U/u/N/n = missing)\n",
158                   tsv->ss);
159     }
160     return 0;
161 }
162 
163 /****************************************
164  * BASIC STATISTICS FUNCTIONS           *
165  ****************************************/
166 
167 // this macro from ksort.h defines the function
168 // float ks_ksmall_float(size_t n, float arr[], size_t kk);
KSORT_INIT_GENERIC(float)169 KSORT_INIT_GENERIC(float)
170 
171 // compute the median of a vector using the ksort library (with iterator)
172 float get_median(const float *v, int n, const int *imap) {
173     if (n == 0) return NAN;
174     float tmp, *w = (float *)malloc(n * sizeof(float));
175     int j = 0;
176     for (int i = 0; i < n; i++) {
177         tmp = imap ? v[imap[i]] : v[i];
178         if (!isnan(tmp)) w[j++] = tmp;
179     }
180     if (j == 0) {
181         free(w);
182         return NAN;
183     }
184     float ret = ks_ksmall_float((size_t)j, w, (size_t)j / 2);
185     if (j % 2 == 0) ret = (ret + w[j / 2 - 1]) * 0.5f;
186     free(w);
187     return ret;
188 }
189 
190 // rho =  xyss / sqrtf(xss * yss)
191 // m = xyss / xss
192 // b = ym - m * xm;
get_cov(const float * x,const float * y,int n,const int * imap,float * xss,float * yss,float * xyss)193 static inline int get_cov(const float *x, const float *y, int n, const int *imap, float *xss, float *yss, float *xyss) {
194     if (n < 2) return -1;
195     float xm = 0.0f;
196     float ym = 0.0f;
197     for (int i = 0; i < n; i++) {
198         int idx = imap ? imap[i] : i;
199         xm += x[idx];
200         ym += y[idx];
201     }
202     xm /= n;
203     ym /= n;
204     *xss = 0.0f;
205     *yss = 0.0f;
206     *xyss = 0.0f;
207     for (int i = 0; i < n; i++) {
208         int idx = imap ? imap[i] : i;
209         float xd = x[idx] - xm;
210         float yd = y[idx] - ym;
211         *xss += xd * xd;
212         *yss += yd * yd;
213         *xyss += xd * yd;
214     }
215     return 0;
216 }
217 
218 /****************************************
219  * EXTRACT DATA FROM VCF FUNCTIONS      *
220  ****************************************/
221 
222 // retrieve unphased genotype alleles information from BCF record
223 // assumes little endian architecture
bcf_get_unphased_genotype_alleles(const bcf_fmt_t * fmt,int16_t * gt0_arr,int16_t * gt1_arr,int nsmpl)224 static inline int bcf_get_unphased_genotype_alleles(const bcf_fmt_t *fmt, int16_t *gt0_arr, int16_t *gt1_arr,
225                                                     int nsmpl) {
226     if (!fmt || fmt->n != 2) return 0;
227 
228 #define BRANCH(type_t, bcf_type_vector_end)                                                                            \
229     {                                                                                                                  \
230         type_t *p = (type_t *)fmt->p;                                                                                  \
231         for (int i = 0; i < nsmpl; i++, p += 2) {                                                                      \
232             if (p[0] == bcf_type_vector_end || bcf_gt_is_missing(p[0]) || p[1] == bcf_type_vector_end                  \
233                 || bcf_gt_is_missing(p[1])) {                                                                          \
234                 gt0_arr[i] = bcf_int16_missing;                                                                        \
235                 gt1_arr[i] = bcf_int16_missing;                                                                        \
236             } else {                                                                                                   \
237                 int swap = bcf_gt_allele(p[0]) > bcf_gt_allele(p[1]);                                                  \
238                 gt0_arr[i] = (int16_t)bcf_gt_allele(p[swap]);                                                          \
239                 gt1_arr[i] = (int16_t)bcf_gt_allele(p[!swap]);                                                         \
240             }                                                                                                          \
241         }                                                                                                              \
242     }
243     switch (fmt->type) {
244     case BCF_BT_INT8:
245         BRANCH(int8_t, bcf_int8_vector_end);
246         break;
247     case BCF_BT_INT16:
248         BRANCH(int16_t, bcf_int16_vector_end);
249         break;
250     case BCF_BT_INT32:
251         BRANCH(int32_t, bcf_int32_vector_end);
252         break;
253     default:
254         error("Unexpected type %d\n", fmt->type);
255     }
256 #undef BRANCH
257 
258     return 1;
259 }
260 
261 // retrieve phase information from BCF record
262 // bcf_int8_missing if genotype is missing
263 // bcf_int8_vector_end if phase does not apply
264 // 0 if phase is not available
265 // 1 if higher number allele received from the mother
266 // -1 if higher number allele received from the father
267 // assumes little endian architecture
bcf_get_genotype_phase(const bcf_fmt_t * fmt,int8_t * gt_phase_arr,int nsmpl)268 static inline int bcf_get_genotype_phase(const bcf_fmt_t *fmt, int8_t *gt_phase_arr, int nsmpl) {
269     // bcf_fmt_t *fmt = bcf_get_fmt_id(line, id);
270     if (!fmt || fmt->n != 2) return 0;
271 
272 #define BRANCH(type_t, bcf_type_vector_end)                                                                            \
273     {                                                                                                                  \
274         type_t *p = (type_t *)fmt->p;                                                                                  \
275         for (int i = 0; i < nsmpl; i++, p += 2) {                                                                      \
276             if (p[0] == bcf_type_vector_end || bcf_gt_is_missing(p[0]) || p[1] == bcf_type_vector_end                  \
277                 || bcf_gt_is_missing(p[1])) {                                                                          \
278                 gt_phase_arr[i] = bcf_int8_missing;                                                                    \
279             } else {                                                                                                   \
280                 type_t gt0 = bcf_gt_allele(p[0]);                                                                      \
281                 type_t gt1 = bcf_gt_allele(p[1]);                                                                      \
282                 if (gt0 == gt1)                                                                                        \
283                     gt_phase_arr[i] = bcf_int8_vector_end;                                                             \
284                 else if (!bcf_gt_is_phased(p[1]))                                                                      \
285                     gt_phase_arr[i] = 0;                                                                               \
286                 else if (gt1 > gt0)                                                                                    \
287                     gt_phase_arr[i] = 1;                                                                               \
288                 else                                                                                                   \
289                     gt_phase_arr[i] = -1;                                                                              \
290             }                                                                                                          \
291         }                                                                                                              \
292     }
293     switch (fmt->type) {
294     case BCF_BT_INT8:
295         BRANCH(int8_t, bcf_int8_vector_end);
296         break;
297     case BCF_BT_INT16:
298         BRANCH(int16_t, bcf_int16_vector_end);
299         break;
300     case BCF_BT_INT32:
301         BRANCH(int32_t, bcf_int32_vector_end);
302         break;
303     default:
304         error("Unexpected type %d\n", fmt->type);
305     }
306 #undef BRANCH
307 
308     return 1;
309 }
310 
311 // retrive allelic depth information from BCF record
312 // assumes little endian architecture
bcf_get_allelic_depth(const bcf_fmt_t * fmt,const int16_t * gt0_arr,const int16_t * gt1_arr,int16_t * ad0_arr,int16_t * ad1_arr,int nsmpl)313 static inline int bcf_get_allelic_depth(const bcf_fmt_t *fmt, const int16_t *gt0_arr, const int16_t *gt1_arr,
314                                         int16_t *ad0_arr, int16_t *ad1_arr, int nsmpl) {
315     if (!fmt) return 0;
316     int nalleles = fmt->n;
317 
318 #define BRANCH(type_t, bcf_type_vector_end, bcf_type_missing)                                                          \
319     {                                                                                                                  \
320         type_t *p = (type_t *)fmt->p;                                                                                  \
321         for (int i = 0; i < nsmpl; i++, p += nalleles) {                                                               \
322             if ((gt0_arr[i] != bcf_int16_missing && (int)gt0_arr[i] >= nalleles)                                       \
323                 || (gt1_arr[i] != bcf_int16_missing && (int)gt1_arr[i] >= nalleles))                                   \
324                 error("Error: found VCF record with GT alleles %d and %d and %d number of alleles\n", gt0_arr[i],      \
325                       gt1_arr[i], nalleles);                                                                           \
326             if (gt0_arr[i] == bcf_int16_missing || gt1_arr[i] == bcf_int16_missing                                     \
327                 || p[gt0_arr[i]] == bcf_type_vector_end || p[gt0_arr[i]] == bcf_type_missing                           \
328                 || p[gt1_arr[i]] == bcf_type_vector_end || p[gt1_arr[i]] == bcf_type_missing) {                        \
329                 ad0_arr[i] = bcf_int16_missing;                                                                        \
330                 ad1_arr[i] = bcf_int16_missing;                                                                        \
331             } else {                                                                                                   \
332                 type_t gt0 = gt0_arr[i] > 0;                                                                           \
333                 type_t gt1 = gt1_arr[i] > 0;                                                                           \
334                 if (gt0 == gt1) {                                                                                      \
335                     ad0_arr[i] = (int16_t)(gt0_arr[i] == gt1_arr[i] ? p[gt0_arr[i]] : p[gt0_arr[i]] + p[gt1_arr[i]]);  \
336                     ad1_arr[i] = bcf_int16_missing;                                                                    \
337                 } else {                                                                                               \
338                     ad0_arr[i] = (int16_t)p[gt0_arr[i]];                                                               \
339                     ad1_arr[i] = (int16_t)p[gt1_arr[i]];                                                               \
340                 }                                                                                                      \
341             }                                                                                                          \
342         }                                                                                                              \
343     }
344     switch (fmt->type) {
345     case BCF_BT_INT8:
346         BRANCH(int8_t, bcf_int8_vector_end, bcf_int8_missing);
347         break;
348     case BCF_BT_INT16:
349         BRANCH(int16_t, bcf_int16_vector_end, bcf_int16_missing);
350         break;
351     case BCF_BT_INT32:
352         BRANCH(int32_t, bcf_int32_vector_end, bcf_int32_missing);
353         break;
354     default:
355         error("Unexpected type %d\n", fmt->type);
356     }
357 #undef BRANCH
358 
359     return 1;
360 }
361 
362 #endif
363