1 /* The MIT License
2
3 Copyright (C) 2018-2021 Giulio Genovese
4
5 Author: Giulio Genovese <giulio.genovese@gmail.com>
6
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 THE SOFTWARE.
24
25 */
26
27 /*
28 Computes beta-binomial likelihoods using recursively pre-calculated tables
29 */
30
31 #ifndef __MOCHA_H__
32 #define __MOCHA_H__
33
34 #include <htslib/kseq.h>
35 #include <htslib/ksort.h>
36 #include <htslib/vcf.h>
37 #include "tsv2vcf.h"
38 #include "bcftools.h"
39
40 #define GENDER_UNKNOWN 0
41 #define GENDER_MALE 1
42 #define GENDER_FEMALE 2
43
44 /****************************************
45 * TSV FUNCTIONS *
46 ****************************************/
47
48 // adapted from Petr Danecek's implementation of tsv_init() in bcftools/tsv2vcf.c
tsv_init_delimiter(const char * str,char delimiter)49 static inline tsv_t *tsv_init_delimiter(const char *str, char delimiter) {
50 tsv_t *tsv = (tsv_t *)calloc(1, sizeof(tsv_t));
51 kstring_t tmp = {0, 0, 0};
52 const char *ss = str, *se = ss;
53 tsv->ncols = 0;
54 while (*ss) {
55 if (delimiter == '\0')
56 while (*se && !isspace(*se)) se++;
57 else
58 while (*se && *se != delimiter) se++;
59 tsv->ncols++;
60 tsv->cols = (tsv_col_t *)realloc(tsv->cols, sizeof(tsv_col_t) * tsv->ncols);
61 tsv->cols[tsv->ncols - 1].name = NULL;
62 tsv->cols[tsv->ncols - 1].setter = NULL;
63 tmp.l = 0;
64 kputsn(ss, se - ss, &tmp);
65 if (strcasecmp("-", tmp.s)) tsv->cols[tsv->ncols - 1].name = strdup(tmp.s);
66 if (!*se) break;
67 se++;
68 if (delimiter == '\0')
69 while (*se && isspace(*se)) se++;
70 ss = se;
71 }
72 free(tmp.s);
73 return tsv;
74 }
75
tsv_parse_delimiter(tsv_t * tsv,bcf1_t * rec,char * str,char delimiter)76 static inline int tsv_parse_delimiter(tsv_t *tsv, bcf1_t *rec, char *str, char delimiter) {
77 int status = 0;
78 tsv->icol = 0;
79 tsv->ss = tsv->se = str;
80 while (*tsv->ss && tsv->icol < tsv->ncols) {
81 if (delimiter == '\0')
82 while (*tsv->se && !isspace(*tsv->se)) tsv->se++;
83 else
84 while (*tsv->se && *tsv->se != delimiter) tsv->se++;
85 if (tsv->cols[tsv->icol].setter) {
86 int ret = tsv->cols[tsv->icol].setter(tsv, rec, tsv->cols[tsv->icol].usr);
87 if (ret < 0) return -1;
88 status++;
89 }
90 if (*tsv->se) {
91 tsv->se++;
92 if (delimiter == '\0')
93 while (*tsv->se && isspace(*tsv->se)) tsv->se++;
94 }
95 tsv->ss = tsv->se;
96 tsv->icol++;
97 }
98 return status ? 0 : -1;
99 }
100
tsv_read_float(tsv_t * tsv,bcf1_t * rec,void * usr)101 int tsv_read_float(tsv_t *tsv, bcf1_t *rec, void *usr) {
102 float *single = (float *)usr;
103 char tmp = *tsv->se;
104 *tsv->se = 0;
105 char *endptr;
106 *single = (float)strtof(tsv->ss, &endptr);
107 *tsv->se = tmp;
108 return 0;
109 }
110
tsv_read_integer(tsv_t * tsv,bcf1_t * rec,void * usr)111 int tsv_read_integer(tsv_t *tsv, bcf1_t *rec, void *usr) {
112 int *integer = (int *)usr;
113 char tmp = *tsv->se;
114 *tsv->se = 0;
115 char *endptr;
116 *integer = (int)strtol(tsv->ss, &endptr, 0);
117 if (*endptr) error("Could not parse integer %s\n", tsv->ss);
118 *tsv->se = tmp;
119 return 0;
120 }
121
tsv_read_string(tsv_t * tsv,bcf1_t * rec,void * usr)122 int tsv_read_string(tsv_t *tsv, bcf1_t *rec, void *usr) {
123 char **str = (char **)usr;
124 if (tsv->se == tsv->ss) {
125 *str = NULL;
126 } else {
127 char tmp = *tsv->se;
128 *tsv->se = 0;
129 *str = strdup(tsv->ss);
130 *tsv->se = tmp;
131 }
132 return 0;
133 }
134
tsv_read_sample_id(tsv_t * tsv,bcf1_t * rec,void * usr)135 int tsv_read_sample_id(tsv_t *tsv, bcf1_t *rec, void *usr) {
136 bcf_hdr_t *hdr = (bcf_hdr_t *)rec;
137 int *idx = (int *)usr;
138 char tmp = *tsv->se;
139 *tsv->se = 0;
140 *idx = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, tsv->ss);
141 *tsv->se = tmp;
142 return 0;
143 }
144
tsv_read_computed_gender(tsv_t * tsv,bcf1_t * rec,void * usr)145 int tsv_read_computed_gender(tsv_t *tsv, bcf1_t *rec, void *usr) {
146 int *computed_gender = (int *)usr;
147 if (toupper(*tsv->ss) == 'M') {
148 *computed_gender = GENDER_MALE;
149 } else if (toupper(*tsv->ss) == 'F') {
150 *computed_gender = GENDER_FEMALE;
151 } else if (toupper(*tsv->ss) == 'U' || toupper(*tsv->ss) == 'N') {
152 *computed_gender = GENDER_UNKNOWN;
153 } else {
154 char *endptr;
155 *computed_gender = (int)strtol(tsv->ss, &endptr, 0);
156 if (*endptr)
157 error("Could not parse gender %s\n(Acceptable values: 1/M/m = male, 2/F/f = female, 0/U/u/N/n = missing)\n",
158 tsv->ss);
159 }
160 return 0;
161 }
162
163 /****************************************
164 * BASIC STATISTICS FUNCTIONS *
165 ****************************************/
166
167 // this macro from ksort.h defines the function
168 // float ks_ksmall_float(size_t n, float arr[], size_t kk);
KSORT_INIT_GENERIC(float)169 KSORT_INIT_GENERIC(float)
170
171 // compute the median of a vector using the ksort library (with iterator)
172 float get_median(const float *v, int n, const int *imap) {
173 if (n == 0) return NAN;
174 float tmp, *w = (float *)malloc(n * sizeof(float));
175 int j = 0;
176 for (int i = 0; i < n; i++) {
177 tmp = imap ? v[imap[i]] : v[i];
178 if (!isnan(tmp)) w[j++] = tmp;
179 }
180 if (j == 0) {
181 free(w);
182 return NAN;
183 }
184 float ret = ks_ksmall_float((size_t)j, w, (size_t)j / 2);
185 if (j % 2 == 0) ret = (ret + w[j / 2 - 1]) * 0.5f;
186 free(w);
187 return ret;
188 }
189
190 // rho = xyss / sqrtf(xss * yss)
191 // m = xyss / xss
192 // b = ym - m * xm;
get_cov(const float * x,const float * y,int n,const int * imap,float * xss,float * yss,float * xyss)193 static inline int get_cov(const float *x, const float *y, int n, const int *imap, float *xss, float *yss, float *xyss) {
194 if (n < 2) return -1;
195 float xm = 0.0f;
196 float ym = 0.0f;
197 for (int i = 0; i < n; i++) {
198 int idx = imap ? imap[i] : i;
199 xm += x[idx];
200 ym += y[idx];
201 }
202 xm /= n;
203 ym /= n;
204 *xss = 0.0f;
205 *yss = 0.0f;
206 *xyss = 0.0f;
207 for (int i = 0; i < n; i++) {
208 int idx = imap ? imap[i] : i;
209 float xd = x[idx] - xm;
210 float yd = y[idx] - ym;
211 *xss += xd * xd;
212 *yss += yd * yd;
213 *xyss += xd * yd;
214 }
215 return 0;
216 }
217
218 /****************************************
219 * EXTRACT DATA FROM VCF FUNCTIONS *
220 ****************************************/
221
222 // retrieve unphased genotype alleles information from BCF record
223 // assumes little endian architecture
bcf_get_unphased_genotype_alleles(const bcf_fmt_t * fmt,int16_t * gt0_arr,int16_t * gt1_arr,int nsmpl)224 static inline int bcf_get_unphased_genotype_alleles(const bcf_fmt_t *fmt, int16_t *gt0_arr, int16_t *gt1_arr,
225 int nsmpl) {
226 if (!fmt || fmt->n != 2) return 0;
227
228 #define BRANCH(type_t, bcf_type_vector_end) \
229 { \
230 type_t *p = (type_t *)fmt->p; \
231 for (int i = 0; i < nsmpl; i++, p += 2) { \
232 if (p[0] == bcf_type_vector_end || bcf_gt_is_missing(p[0]) || p[1] == bcf_type_vector_end \
233 || bcf_gt_is_missing(p[1])) { \
234 gt0_arr[i] = bcf_int16_missing; \
235 gt1_arr[i] = bcf_int16_missing; \
236 } else { \
237 int swap = bcf_gt_allele(p[0]) > bcf_gt_allele(p[1]); \
238 gt0_arr[i] = (int16_t)bcf_gt_allele(p[swap]); \
239 gt1_arr[i] = (int16_t)bcf_gt_allele(p[!swap]); \
240 } \
241 } \
242 }
243 switch (fmt->type) {
244 case BCF_BT_INT8:
245 BRANCH(int8_t, bcf_int8_vector_end);
246 break;
247 case BCF_BT_INT16:
248 BRANCH(int16_t, bcf_int16_vector_end);
249 break;
250 case BCF_BT_INT32:
251 BRANCH(int32_t, bcf_int32_vector_end);
252 break;
253 default:
254 error("Unexpected type %d\n", fmt->type);
255 }
256 #undef BRANCH
257
258 return 1;
259 }
260
261 // retrieve phase information from BCF record
262 // bcf_int8_missing if genotype is missing
263 // bcf_int8_vector_end if phase does not apply
264 // 0 if phase is not available
265 // 1 if higher number allele received from the mother
266 // -1 if higher number allele received from the father
267 // assumes little endian architecture
bcf_get_genotype_phase(const bcf_fmt_t * fmt,int8_t * gt_phase_arr,int nsmpl)268 static inline int bcf_get_genotype_phase(const bcf_fmt_t *fmt, int8_t *gt_phase_arr, int nsmpl) {
269 // bcf_fmt_t *fmt = bcf_get_fmt_id(line, id);
270 if (!fmt || fmt->n != 2) return 0;
271
272 #define BRANCH(type_t, bcf_type_vector_end) \
273 { \
274 type_t *p = (type_t *)fmt->p; \
275 for (int i = 0; i < nsmpl; i++, p += 2) { \
276 if (p[0] == bcf_type_vector_end || bcf_gt_is_missing(p[0]) || p[1] == bcf_type_vector_end \
277 || bcf_gt_is_missing(p[1])) { \
278 gt_phase_arr[i] = bcf_int8_missing; \
279 } else { \
280 type_t gt0 = bcf_gt_allele(p[0]); \
281 type_t gt1 = bcf_gt_allele(p[1]); \
282 if (gt0 == gt1) \
283 gt_phase_arr[i] = bcf_int8_vector_end; \
284 else if (!bcf_gt_is_phased(p[1])) \
285 gt_phase_arr[i] = 0; \
286 else if (gt1 > gt0) \
287 gt_phase_arr[i] = 1; \
288 else \
289 gt_phase_arr[i] = -1; \
290 } \
291 } \
292 }
293 switch (fmt->type) {
294 case BCF_BT_INT8:
295 BRANCH(int8_t, bcf_int8_vector_end);
296 break;
297 case BCF_BT_INT16:
298 BRANCH(int16_t, bcf_int16_vector_end);
299 break;
300 case BCF_BT_INT32:
301 BRANCH(int32_t, bcf_int32_vector_end);
302 break;
303 default:
304 error("Unexpected type %d\n", fmt->type);
305 }
306 #undef BRANCH
307
308 return 1;
309 }
310
311 // retrive allelic depth information from BCF record
312 // assumes little endian architecture
bcf_get_allelic_depth(const bcf_fmt_t * fmt,const int16_t * gt0_arr,const int16_t * gt1_arr,int16_t * ad0_arr,int16_t * ad1_arr,int nsmpl)313 static inline int bcf_get_allelic_depth(const bcf_fmt_t *fmt, const int16_t *gt0_arr, const int16_t *gt1_arr,
314 int16_t *ad0_arr, int16_t *ad1_arr, int nsmpl) {
315 if (!fmt) return 0;
316 int nalleles = fmt->n;
317
318 #define BRANCH(type_t, bcf_type_vector_end, bcf_type_missing) \
319 { \
320 type_t *p = (type_t *)fmt->p; \
321 for (int i = 0; i < nsmpl; i++, p += nalleles) { \
322 if ((gt0_arr[i] != bcf_int16_missing && (int)gt0_arr[i] >= nalleles) \
323 || (gt1_arr[i] != bcf_int16_missing && (int)gt1_arr[i] >= nalleles)) \
324 error("Error: found VCF record with GT alleles %d and %d and %d number of alleles\n", gt0_arr[i], \
325 gt1_arr[i], nalleles); \
326 if (gt0_arr[i] == bcf_int16_missing || gt1_arr[i] == bcf_int16_missing \
327 || p[gt0_arr[i]] == bcf_type_vector_end || p[gt0_arr[i]] == bcf_type_missing \
328 || p[gt1_arr[i]] == bcf_type_vector_end || p[gt1_arr[i]] == bcf_type_missing) { \
329 ad0_arr[i] = bcf_int16_missing; \
330 ad1_arr[i] = bcf_int16_missing; \
331 } else { \
332 type_t gt0 = gt0_arr[i] > 0; \
333 type_t gt1 = gt1_arr[i] > 0; \
334 if (gt0 == gt1) { \
335 ad0_arr[i] = (int16_t)(gt0_arr[i] == gt1_arr[i] ? p[gt0_arr[i]] : p[gt0_arr[i]] + p[gt1_arr[i]]); \
336 ad1_arr[i] = bcf_int16_missing; \
337 } else { \
338 ad0_arr[i] = (int16_t)p[gt0_arr[i]]; \
339 ad1_arr[i] = (int16_t)p[gt1_arr[i]]; \
340 } \
341 } \
342 } \
343 }
344 switch (fmt->type) {
345 case BCF_BT_INT8:
346 BRANCH(int8_t, bcf_int8_vector_end, bcf_int8_missing);
347 break;
348 case BCF_BT_INT16:
349 BRANCH(int16_t, bcf_int16_vector_end, bcf_int16_missing);
350 break;
351 case BCF_BT_INT32:
352 BRANCH(int32_t, bcf_int32_vector_end, bcf_int32_missing);
353 break;
354 default:
355 error("Unexpected type %d\n", fmt->type);
356 }
357 #undef BRANCH
358
359 return 1;
360 }
361
362 #endif
363