1 /* The MIT License
2
3 Copyright (c) 2016-2021 Genome Research Ltd.
4
5 Author: Petr Danecek <pd3@sanger.ac.uk>
6
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 THE SOFTWARE.
24
25 */
26 /*
27 Things that would be nice to have
28 - dynamic N_REF_PAD
29 - for stop-lost events (also in frameshifts) report the number of truncated aa's
30 - memory could be greatly reduced by indexing gff (but it is quite compact already)
31 - deletions that go beyond transcript boundaries are not checked at sequence level
32 - alloc tscript->ref in hap_finalize, introduce fa_off_beg:16,fa_off_end:16
33 - see test/csq/ENST00000573314/insertion-overlap.vcf #1476288882
34
35 Read about transcript types here
36 http://vega.sanger.ac.uk/info/about/gene_and_transcript_types.html
37 http://www.ensembl.org/info/genome/variation/predicted_data.html
38 http://www.gencodegenes.org/gencode_biotypes.html
39
40 List of supported biotypes
41 antisense
42 IG_C_gene
43 IG_D_gene
44 IG_J_gene
45 IG_LV_gene
46 IG_V_gene
47 lincRNA
48 macro_lncRNA
49 miRNA
50 misc_RNA
51 Mt_rRNA
52 Mt_tRNA
53 polymorphic_pseudogene
54 processed_transcript
55 protein_coding
56 ribozyme
57 rRNA
58 sRNA
59 scRNA
60 scaRNA
61 sense_intronic
62 sense_overlapping
63 snRNA
64 snoRNA
65 TR_C_gene
66 TR_D_gene
67 TR_J_gene
68 TR_V_gene
69
70 The gff parsing logic
71 We collect features such by combining gff lines A,B,C as follows:
72 A .. gene line with a supported biotype
73 A.ID=~/^gene:/
74
75 B .. transcript line referencing A with supported biotype
76 B.ID=~/^transcript:/ && B.Parent=~/^gene:A.ID/
77
78 C .. corresponding CDS, exon, and UTR lines:
79 C[3] in {"CDS","exon","three_prime_UTR","five_prime_UTR"} && C.Parent=~/^transcript:B.ID/
80
81 For coding biotypes ("protein_coding" or "polymorphic_pseudogene") the
82 complete chain link C -> B -> A is required. For the rest, link B -> A suffices.
83
84
85 The supported consequence types, sorted by impact:
86 splice_acceptor_variant .. end region of an intron changed (2bp at the 3' end of an intron)
87 splice_donor_variant .. start region of an intron changed (2bp at the 5' end of an intron)
88 stop_gained .. DNA sequence variant resulting in a stop codon
89 frameshift_variant .. number of inserted/deleted bases not a multiple of three, disrupted translational frame
90 stop_lost .. elongated transcript, stop codon changed
91 start_lost .. the first codon changed
92 inframe_altering .. combination of indels leading to unchanged reading frame and length
93 inframe_insertion .. inserted coding sequence, unchanged reading frame
94 inframe_deletion .. deleted coding sequence, unchanged reading frame
95 missense_variant .. amino acid (aa) change, unchanged length
96 splice_region_variant .. change within 1-3 bases of the exon or 3-8 bases of the intron
97 synonymous_variant .. DNA sequence variant resulting in no amino acid change
98 stop_retained_variant .. different stop codon
99 start_retained_variant .. start codon retained by indel realignment
100 non_coding_variant .. variant in non-coding sequence, such as RNA gene
101 5_prime_UTR_variant
102 3_prime_UTR_variant
103 intron_variant .. reported only if none of the above
104 intergenic_variant .. reported only if none of the above
105
106
107 The annotation algorithm.
108 The algorithm checks if the variant falls in a region of a supported type. The
109 search is performed in the following order, until a match is found:
110 1. idx_cds(gf_cds_t) - lookup CDS by position, create haplotypes, call consequences
111 2. idx_utr(gf_utr_t) - check UTR hits
112 3. idx_exon(gf_exon_t) - check for splice variants
113 4. idx_tscript(tscript_t) - check for intronic variants, RNAs, etc.
114
115 These regidx indexes are created by parsing a gff3 file as follows:
116 1. create the array "ftr" of all UTR, CDS, exons. This will be
117 processed later and pruned based on transcript types we want to keep.
118 In the same go, create the hash "id2tr" of transcripts to keep
119 (based on biotype) which maps from transcript_id to a transcript. At
120 the same time also build the hash "gid2gene" which maps from gene_id to
121 gf_gene_t pointer.
122
123 2. build "idx_cds", "idx_tscript", "idx_utr" and "idx_exon" indexes.
124 Use only features from "ftr" which are present in "id2tr".
125
126 3. clean data that won't be needed anymore: ftr, id2tr, gid2gene.
127
128 Data structures.
129 idx_cds, idx_utr, idx_exon, idx_tscript:
130 as described above, regidx structures for fast lookup of exons/transcripts
131 overlapping a region, the payload is a pointer to tscript.cds
132 */
133
134 #include <stdio.h>
135 #include <stdlib.h>
136 #include <assert.h>
137 #include <getopt.h>
138 #include <math.h>
139 #include <inttypes.h>
140 #include <htslib/hts.h>
141 #include <htslib/vcf.h>
142 #include <htslib/synced_bcf_reader.h>
143 #include <htslib/khash.h>
144 #include <htslib/khash_str2int.h>
145 #include <htslib/kseq.h>
146 #include <htslib/faidx.h>
147 #include <errno.h>
148 #include <unistd.h>
149 #include <ctype.h>
150 #include "bcftools.h"
151 #include "filter.h"
152 #include "regidx.h"
153 #include "kheap.h"
154 #include "smpl_ilist.h"
155 #include "rbuf.h"
156
157 #ifndef __FUNCTION__
158 # define __FUNCTION__ __func__
159 #endif
160
161 // Logic of the filters: include or exclude sites which match the filters?
162 #define FLT_INCLUDE 1
163 #define FLT_EXCLUDE 2
164
165 // Definition of splice_region, splice_acceptor and splice_donor
166 #define N_SPLICE_DONOR 2
167 #define N_SPLICE_REGION_EXON 3
168 #define N_SPLICE_REGION_INTRON 8
169
170 #define N_REF_PAD 10 // number of bases to avoid boundary effects
171
172 #define STRAND_REV 0
173 #define STRAND_FWD 1
174
175 #define TRIM_NONE 0
176 #define TRIM_5PRIME 1
177 #define TRIM_3PRIME 2
178
179 // How to treat phased/unphased genotypes
180 #define PHASE_REQUIRE 0 // --phase r
181 #define PHASE_MERGE 1 // --phase m
182 #define PHASE_AS_IS 2 // --phase a
183 #define PHASE_SKIP 3 // --phase s
184 #define PHASE_NON_REF 4 // --phase R
185 #define PHASE_DROP_GT 5 // --samples -
186
187 // Node types in the haplotype tree
188 #define HAP_CDS 0
189 #define HAP_ROOT 1
190 #define HAP_SSS 2 // start/stop/splice
191
192 #define CSQ_PRINTED_UPSTREAM (1<<0)
193 #define CSQ_SYNONYMOUS_VARIANT (1<<1)
194 #define CSQ_MISSENSE_VARIANT (1<<2)
195 #define CSQ_STOP_LOST (1<<3)
196 #define CSQ_STOP_GAINED (1<<4)
197 #define CSQ_INFRAME_DELETION (1<<5)
198 #define CSQ_INFRAME_INSERTION (1<<6)
199 #define CSQ_FRAMESHIFT_VARIANT (1<<7)
200 #define CSQ_SPLICE_ACCEPTOR (1<<8)
201 #define CSQ_SPLICE_DONOR (1<<9)
202 #define CSQ_START_LOST (1<<10)
203 #define CSQ_SPLICE_REGION (1<<11)
204 #define CSQ_STOP_RETAINED (1<<12)
205 #define CSQ_UTR5 (1<<13)
206 #define CSQ_UTR3 (1<<14)
207 #define CSQ_NON_CODING (1<<15)
208 #define CSQ_INTRON (1<<16)
209 //#define CSQ_INTERGENIC (1<<17)
210 #define CSQ_INFRAME_ALTERING (1<<18)
211 #define CSQ_UPSTREAM_STOP (1<<19) // adds * in front of the csq string
212 #define CSQ_INCOMPLETE_CDS (1<<20) // to remove START/STOP in incomplete CDS, see ENSG00000173376/synon.vcf
213 #define CSQ_CODING_SEQUENCE (1<<21) // cannot tell exactly what it is, but it does affect the coding sequence
214 #define CSQ_ELONGATION (1<<22) // symbolic insertion
215 #define CSQ_START_RETAINED (1<<23)
216
217 // Haplotype-aware consequences, printed in one vcf record only, the rest has a reference @12345
218 #define CSQ_COMPOUND (CSQ_SYNONYMOUS_VARIANT|CSQ_MISSENSE_VARIANT|CSQ_STOP_LOST|CSQ_STOP_GAINED| \
219 CSQ_INFRAME_DELETION|CSQ_INFRAME_INSERTION|CSQ_FRAMESHIFT_VARIANT| \
220 CSQ_START_LOST|CSQ_STOP_RETAINED|CSQ_INFRAME_ALTERING|CSQ_INCOMPLETE_CDS| \
221 CSQ_UPSTREAM_STOP|CSQ_START_RETAINED)
222 #define CSQ_START_STOP (CSQ_STOP_LOST|CSQ_STOP_GAINED|CSQ_STOP_RETAINED|CSQ_START_LOST|CSQ_START_RETAINED)
223
224 #define CSQ_PRN_STRAND(csq) ((csq)&CSQ_COMPOUND && !((csq)&(CSQ_SPLICE_ACCEPTOR|CSQ_SPLICE_DONOR|CSQ_SPLICE_REGION)))
225 #define CSQ_PRN_TSCRIPT (~(CSQ_INTRON|CSQ_NON_CODING))
226 #define CSQ_PRN_BIOTYPE CSQ_NON_CODING
227
228 // see kput_vcsq()
229 const char *csq_strings[] =
230 {
231 NULL,
232 "synonymous",
233 "missense",
234 "stop_lost",
235 "stop_gained",
236 "inframe_deletion",
237 "inframe_insertion",
238 "frameshift",
239 "splice_acceptor",
240 "splice_donor",
241 "start_lost",
242 "splice_region",
243 "stop_retained",
244 "5_prime_utr",
245 "3_prime_utr",
246 "non_coding",
247 "intron",
248 "intergenic",
249 "inframe_altering",
250 NULL,
251 NULL,
252 "coding_sequence",
253 "feature_elongation",
254 "start_retained"
255 };
256
257
258 // GFF line types
259 #define GFF_TSCRIPT_LINE 1
260 #define GFF_GENE_LINE 2
261
262
263 /*
264 Genomic features, for fast lookup by position to overlapping features
265 */
266 #define GF_coding_bit 6
267 #define GF_is_coding(x) ((x) & (1<<GF_coding_bit))
268 #define GF_MT_rRNA 1 // non-coding: 1, 2, ...
269 #define GF_MT_tRNA 2
270 #define GF_lincRNA 3
271 #define GF_miRNA 4
272 #define GF_MISC_RNA 5
273 #define GF_rRNA 6
274 #define GF_snRNA 7
275 #define GF_snoRNA 8
276 #define GF_PROCESSED_TRANSCRIPT 9
277 #define GF_ANTISENSE 10
278 #define GF_macro_lncRNA 11
279 #define GF_ribozyme 12
280 #define GF_sRNA 13
281 #define GF_scRNA 14
282 #define GF_scaRNA 15
283 #define GF_SENSE_INTRONIC 16
284 #define GF_SENSE_OVERLAPPING 17
285 #define GF_PSEUDOGENE 18
286 #define GF_PROCESSED_PSEUDOGENE 19
287 #define GF_ARTIFACT 20
288 #define GF_IG_PSEUDOGENE 21
289 #define GF_IG_C_PSEUDOGENE 22
290 #define GF_IG_J_PSEUDOGENE 23
291 #define GF_IG_V_PSEUDOGENE 24
292 #define GF_TR_V_PSEUDOGENE 25
293 #define GF_TR_J_PSEUDOGENE 26
294 #define GF_MT_tRNA_PSEUDOGENE 27
295 #define GF_misc_RNA_PSEUDOGENE 28
296 #define GF_miRNA_PSEUDOGENE 29
297 #define GF_RIBOZYME 30
298 #define GF_RETAINED_INTRON 31
299 #define GF_RETROTRANSPOSED 32
300 #define GF_tRNA_PSEUDOGENE 33
301 #define GF_TRANSCRIBED_PROCESSED_PSEUDOGENE 34
302 #define GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE 35
303 #define GF_TRANSCRIBED_UNITARY_PSEUDOGENE 36
304 #define GF_TRANSLATED_UNPROCESSED_PSEUDOGENE 37
305 #define GF_TRANSLATED_PROCESSED_PSEUDOGENE 38
306 #define GF_KNOWN_NCRNA 39
307 #define GF_UNITARY_PSEUDOGENE 40
308 #define GF_UNPROCESSED_PSEUDOGENE 41
309 #define GF_LRG_GENE 42
310 #define GF_3PRIME_OVERLAPPING_ncRNA 43
311 #define GF_DISRUPTED_DOMAIN 44
312 #define GF_vaultRNA 45
313 #define GF_BIDIRECTIONAL_PROMOTER_lncRNA 46
314 #define GF_AMBIGUOUS_ORF 47
315 #define GF_PROTEIN_CODING (1|(1<<GF_coding_bit)) // coding: 65, 66, ...
316 #define GF_POLYMORPHIC_PSEUDOGENE (2|(1<<GF_coding_bit))
317 #define GF_IG_C (3|(1<<GF_coding_bit))
318 #define GF_IG_D (4|(1<<GF_coding_bit))
319 #define GF_IG_J (5|(1<<GF_coding_bit))
320 #define GF_IG_LV (6|(1<<GF_coding_bit))
321 #define GF_IG_V (7|(1<<GF_coding_bit))
322 #define GF_TR_C (8|(1<<GF_coding_bit))
323 #define GF_TR_D (9|(1<<GF_coding_bit))
324 #define GF_TR_J (10|(1<<GF_coding_bit))
325 #define GF_TR_V (11|(1<<GF_coding_bit))
326 #define GF_NMD (12|(1<<GF_coding_bit))
327 #define GF_NON_STOP_DECAY (13|(1<<GF_coding_bit))
328 #define GF_CDS ((1<<(GF_coding_bit+1))+1) // special types: 129, 130, ...
329 #define GF_EXON ((1<<(GF_coding_bit+1))+2)
330 #define GF_UTR3 ((1<<(GF_coding_bit+1))+3)
331 #define GF_UTR5 ((1<<(GF_coding_bit+1))+4)
332 // GF_MAX = (1<<30)-1, see hap_node_t
333
334 typedef struct _tscript_t tscript_t;
335 typedef struct
336 {
337 tscript_t *tr; // transcript
338 uint32_t beg; // the start coordinate of the CDS (on the reference strand, 0-based)
339 uint32_t pos; // 0-based index of the first exon base within the transcript (only to
340 // update hap_node_t.sbeg in hap_init, could be calculated on the fly)
341 uint32_t len; // exon length
342 uint32_t icds:30, // exon index within the transcript
343 phase:2; // offset of the CDS
344 }
345 gf_cds_t;
346 typedef struct
347 {
348 char *name; // human readable name, e.g. ORF45
349 uint32_t iseq;
350 }
351 gf_gene_t;
352 typedef struct
353 {
354 uint32_t beg,end;
355 tscript_t *tr;
356 }
357 gf_exon_t;
358 typedef enum { prime3, prime5 } utr_t;
359 typedef struct
360 {
361 utr_t which;
362 uint32_t beg,end;
363 tscript_t *tr;
364 }
365 gf_utr_t;
366
367
368 /*
369 Structures related to VCF output:
370
371 vcsq_t
372 information required to assemble consequence lines such as "inframe_deletion|XYZ|ENST01|+|5TY>5I|121ACG>A+124TA>T"
373
374 vrec_t
375 single VCF record and csq tied to this record. (Haplotype can have multiple
376 consequences in several VCF records. Each record can have multiple consequences
377 from multiple haplotypes.)
378
379 csq_t
380 a top-level consequence tied to a haplotype
381
382 vbuf_t
383 pos2vbuf
384 VCF records with the same position clustered together for a fast lookup via pos2vbuf
385 */
386 typedef struct _vbuf_t vbuf_t;
387 typedef struct _vcsq_t vcsq_t;
388 struct _vcsq_t
389 {
390 uint32_t strand:1,
391 type:31; // one of CSQ_* types
392 uint32_t trid;
393 uint32_t vcf_ial;
394 uint32_t biotype; // one of GF_* types
395 char *gene; // gene name
396 bcf1_t *ref; // if type&CSQ_PRINTED_UPSTREAM, ref consequence "@1234"
397 kstring_t vstr; // variant string, eg 5TY>5I|121ACG>A+124TA>T
398 };
399 typedef struct
400 {
401 bcf1_t *line;
402 uint32_t *fmt_bm; // bitmask of sample consequences with first/second haplotype interleaved
403 uint32_t nfmt:4, // the bitmask size (the number of integers per sample)
404 nvcsq:28, mvcsq;
405 vcsq_t *vcsq; // there can be multiple consequences for a single VCF record
406 }
407 vrec_t;
408 typedef struct
409 {
410 uint32_t pos;
411 vrec_t *vrec; // vcf line that this csq is tied to; needed when printing haplotypes (hap_stage_vcf)
412 int idx; // 0-based index of the csq at the VCF line, for FMT/BCSQ
413 vcsq_t type;
414 }
415 csq_t;
416 struct _vbuf_t
417 {
418 vrec_t **vrec; // buffer of VCF lines with the same position
419 int n, m;
420 uint32_t keep_until; // the maximum transcript end position
421 };
422 KHASH_MAP_INIT_INT(pos2vbuf, vbuf_t*)
423
424
425 /*
426 Structures related to haplotype-aware consequences in coding regions
427
428 hap_node_t
429 node of a haplotype tree. Each transcript has one tree
430
431 tscript_t
432 despite its general name, it is intended for coding transcripts only
433
434 hap_t
435 hstack_t
436 for traversal of the haplotype tree and braking combined
437 consequences into independent parts
438 */
439 typedef struct _hap_node_t hap_node_t;
440 struct _hap_node_t
441 {
442 char *seq; // cds segment [parent_node,this_node)
443 char *var; // variant "ref>alt"
444 uint32_t type:2, // HAP_ROOT or HAP_CDS
445 csq:30; // this node's consequence
446 int dlen; // alt minus ref length: <0 del, >0 ins, 0 substitution
447 uint32_t rbeg; // variant's VCF position (0-based, inclusive)
448 int32_t rlen; // variant's rlen; alen=rlen+dlen; fake for non CDS types
449 uint32_t sbeg; // variant's position on the spliced reference transcript (0-based, inclusive, N_REF_PAD not included)
450 uint32_t icds; // which exon does this node's variant overlaps
451 hap_node_t **child, *prev; // children haplotypes and previous coding node
452 int nchild, mchild;
453 bcf1_t *cur_rec, *rec; // current VCF record and node's VCF record
454 int vcf_ial; // which VCF allele generated this node
455 uint32_t nend; // number of haplotypes ending in this node
456 int *cur_child, mcur_child; // mapping from the allele to the currently active child
457 csq_t *csq_list; // list of haplotype's consequences, broken by position (each corresponds to a VCF record)
458 int ncsq_list, mcsq_list;
459 };
460 struct _tscript_t
461 {
462 uint32_t id; // transcript id
463 uint32_t beg,end; // transcript's beg and end coordinate (ref strand, 0-based, inclusive)
464 uint32_t strand:1, // STRAND_REV or STRAND_FWD
465 ncds:31, // number of exons
466 mcds;
467 gf_cds_t **cds; // ordered list of exons
468 char *ref; // reference sequence, padded with N_REF_PAD bases on both ends
469 char *sref; // spliced reference sequence, padded with N_REF_PAD bases on both ends
470 hap_node_t *root; // root of the haplotype tree
471 hap_node_t **hap; // pointer to haplotype leaves, two for each sample
472 int nhap, nsref; // number of haplotypes and length of sref, including 2*N_REF_PAD
473 uint32_t trim:2, // complete, 5' or 3' trimmed, see TRIM_* types
474 type:30; // one of GF_* types
475 gf_gene_t *gene;
476 };
cmp_tscript(tscript_t ** a,tscript_t ** b)477 static inline int cmp_tscript(tscript_t **a, tscript_t **b)
478 {
479 return ( (*a)->end < (*b)->end ) ? 1 : 0;
480 }
481 KHEAP_INIT(trhp, tscript_t*, cmp_tscript)
482 typedef khp_trhp_t tr_heap_t;
483 typedef struct
484 {
485 hap_node_t *node; // current node
486 int ichild; // current child in the active node
487 int dlen; // total dlen, from the root to the active node
488 size_t slen; // total sequence length, from the root to the active node
489 }
490 hstack_t;
491 typedef struct
492 {
493 int mstack;
494 hstack_t *stack;
495 tscript_t *tr; // tr->ref: spliced transcript on ref strand
496 kstring_t sseq; // spliced haplotype sequence on ref strand
497 kstring_t tseq; // the variable part of translated haplotype transcript, coding strand
498 kstring_t tref; // the variable part of translated reference transcript, coding strand
499 uint32_t sbeg; // stack's sbeg, for cases first node's type is HAP_SSS
500 int upstream_stop;
501 }
502 hap_t;
503
504
505 /*
506 Helper structures, only for initialization
507
508 ftr_t
509 temporary list of all exons, CDS, UTRs
510 */
511 KHASH_MAP_INIT_INT(int2tscript, tscript_t*)
512 KHASH_MAP_INIT_INT(int2gene, gf_gene_t*)
513 typedef struct
514 {
515 int type; // GF_CDS, GF_EXON, GF_5UTR, GF_3UTR
516 uint32_t beg;
517 uint32_t end;
518 uint32_t trid;
519 uint32_t strand:1; // STRAND_REV,STRAND_FWD
520 uint32_t phase:2; // 0, 1 or 2
521 uint32_t iseq:29;
522 }
523 ftr_t;
524 /*
525 Mapping from GFF ID string (such as ENST00000450305 or Zm00001d027230_P001)
526 to integer id. To keep the memory requirements low, the original version
527 relied on IDs in the form of a string prefix and a numerical id. However,
528 it turns out that this assumption is not valid for some ensembl GFFs, see
529 for example Zea_mays.AGPv4.36.gff3.gz
530 */
531 typedef struct
532 {
533 void *str2id; // khash_str2int
534 int nstr, mstr;
535 char **str; // numeric id to string
536 }
537 id_tbl_t;
538 typedef struct
539 {
540 // all exons, CDS, UTRs
541 ftr_t *ftr;
542 int nftr, mftr;
543
544 // mapping from gene id to gf_gene_t
545 kh_int2gene_t *gid2gene;
546
547 // mapping from transcript id to tscript, for quick CDS anchoring
548 kh_int2tscript_t *id2tr;
549
550 // sequences
551 void *seq2int; // str2int hash
552 char **seq;
553 int nseq, mseq;
554
555 // ignored biotypes
556 void *ignored_biotypes;
557
558 id_tbl_t gene_ids; // temporary table for mapping between gene id (eg. Zm00001d027245) and a numeric idx
559 }
560 aux_t;
561
562 typedef struct _args_t
563 {
564 // the main regidx lookups, from chr:beg-end to overlapping features and
565 // index iterator
566 regidx_t *idx_cds, *idx_utr, *idx_exon, *idx_tscript;
567 regitr_t *itr;
568
569 // temporary structures, deleted after initializtion
570 aux_t init;
571
572 // text tab-delimited output (out) or vcf/bcf output (out_fh)
573 FILE *out;
574 htsFile *out_fh;
575
576 // vcf
577 bcf_srs_t *sr;
578 bcf_hdr_t *hdr;
579 int hdr_nsmpl; // actual number of samples in the vcf, for bcf_update_format_values()
580
581 // include or exclude sites which match the filters
582 filter_t *filter;
583 char *filter_str;
584 int filter_logic; // FLT_INCLUDE or FLT_EXCLUDE
585
586 // samples to process
587 int sample_is_file;
588 char *sample_list;
589 smpl_ilist_t *smpl;
590
591 char *outdir, **argv, *fa_fname, *gff_fname, *output_fname;
592 char *bcsq_tag;
593 int argc, output_type, clevel;
594 int phase, verbosity, local_csq, record_cmd_line;
595 int ncsq2_max, nfmt_bcsq; // maximum number of csq per site that can be accessed from FORMAT/BCSQ (*2 and 1 bit skipped to avoid BCF missing values)
596 int ncsq2_small_warned;
597 int brief_predictions;
598
599 int rid; // current chromosome
600 tr_heap_t *active_tr; // heap of active transcripts for quick flushing
601 hap_t *hap; // transcript haplotype recursion
602 vbuf_t **vcf_buf; // buffered VCF lines to annotate with CSQ and flush
603 rbuf_t vcf_rbuf; // round buffer indexes to vcf_buf
604 kh_pos2vbuf_t *pos2vbuf; // fast lookup of buffered lines by position
605 tscript_t **rm_tr; // buffer of transcripts to clean
606 int nrm_tr, mrm_tr;
607 csq_t *csq_buf; // pool of csq not managed by hap_node_t, i.e. non-CDS csqs
608 int ncsq_buf, mcsq_buf;
609 id_tbl_t tscript_ids; // mapping between transcript id (eg. Zm00001d027245_T001) and a numeric idx
610 int force; // force run under various conditions. Currently only to skip out-of-phase transcripts
611 int n_threads; // extra compression/decompression threads
612
613 faidx_t *fai;
614 kstring_t str, str2;
615 int32_t *gt_arr, mgt_arr;
616 }
617 args_t;
618
619 // AAA, AAC, ...
620 const char *gencode = "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF";
621 const uint8_t nt4[] =
622 {
623 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
624 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
625 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
626 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
627 4,0,4,1, 4,4,4,2, 4,4,4,4, 4,4,4,4,
628 4,4,4,4, 3,4,4,4, 4,4,4,4, 4,4,4,4,
629 4,0,4,1, 4,4,4,2, 4,4,4,4, 4,4,4,4,
630 4,4,4,4, 3
631 };
632 const uint8_t cnt4[] =
633 {
634 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
635 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
636 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
637 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
638 4,3,4,2, 4,4,4,1, 4,4,4,4, 4,4,4,4,
639 4,4,4,4, 0,4,4,4, 4,4,4,4, 4,4,4,4,
640 4,3,4,2, 4,4,4,1, 4,4,4,4, 4,4,4,4,
641 4,4,4,4, 0
642 };
643 #define dna2aa(x) gencode[ nt4[(uint8_t)(x)[0]]<<4 | nt4[(uint8_t)(x)[1]]<<2 | nt4[(uint8_t)(x)[2]] ]
644 #define cdna2aa(x) gencode[ cnt4[(uint8_t)(x)[2]]<<4 | cnt4[(uint8_t)(x)[1]]<<2 | cnt4[(uint8_t)(x)[0]] ]
645
646 static const char *gf_strings_noncoding[] =
647 {
648 "MT_rRNA", "MT_tRNA", "lincRNA", "miRNA", "misc_RNA", "rRNA", "snRNA", "snoRNA", "processed_transcript",
649 "antisense", "macro_lncRNA", "ribozyme", "sRNA", "scRNA", "scaRNA", "sense_intronic", "sense_overlapping",
650 "pseudogene", "processed_pseudogene", "artifact", "IG_pseudogene", "IG_C_pseudogene", "IG_J_pseudogene",
651 "IG_V_pseudogene", "TR_V_pseudogene", "TR_J_pseudogene", "MT_tRNA_pseudogene", "misc_RNA_pseudogene",
652 "miRNA_pseudogene", "ribozyme", "retained_intron", "retrotransposed", "Trna_pseudogene", "transcribed_processed_pseudogene",
653 "transcribed_unprocessed_pseudogene", "transcribed_unitary_pseudogene", "translated_unprocessed_pseudogene",
654 "translated_processed_pseudogene", "known_ncRNA", "unitary_pseudogene", "unprocessed_pseudogene",
655 "LRG_gene", "3_prime_overlapping_ncRNA", "disrupted_domain", "vaultRNA", "bidirectional_promoter_lncRNA", "ambiguous_orf"
656 };
657 static const char *gf_strings_coding[] = { "protein_coding", "polymorphic_pseudogene", "IG_C", "IG_D", "IG_J", "IG_LV", "IG_V", "TR_C", "TR_D", "TR_J", "TR_V", "NMD", "non_stop_decay"};
658 static const char *gf_strings_special[] = { "CDS", "exon", "3_prime_UTR", "5_prime_UTR" };
659
gf_type2gff_string(int type)660 const char *gf_type2gff_string(int type)
661 {
662 if ( !GF_is_coding(type) )
663 {
664 if ( type < (1<<GF_coding_bit) ) return gf_strings_noncoding[type-1];
665 type &= (1<<(GF_coding_bit+1)) - 1;
666 return gf_strings_special[type - 1];
667 }
668 type &= (1<<GF_coding_bit) - 1;
669 return gf_strings_coding[type - 1];
670 }
671
672 /*
673 gff parsing functions
674 */
feature_set_seq(args_t * args,char * chr_beg,char * chr_end)675 static inline int feature_set_seq(args_t *args, char *chr_beg, char *chr_end)
676 {
677 aux_t *aux = &args->init;
678 char c = chr_end[1];
679 chr_end[1] = 0;
680 int iseq;
681 if ( khash_str2int_get(aux->seq2int, chr_beg, &iseq)!=0 )
682 {
683 // check for possible mismatch in chromosome naming convention such as chrX vs X
684 char *new_chr = NULL;
685 if ( faidx_has_seq(args->fai,chr_beg) )
686 new_chr = strdup(chr_beg); // valid chr name, the same in gff and faidx
687 else
688 {
689 int len = strlen(chr_beg);
690 if ( !strncmp("chr",chr_beg,3) && len>3 )
691 new_chr = strdup(chr_beg+3); // gff has the prefix, faidx does not
692 else
693 {
694 new_chr = malloc(len+4); // gff does not have the prefix, faidx has
695 memcpy(new_chr,"chr",3);
696 memcpy(new_chr+3,chr_beg,len);
697 new_chr[len+3] = 0;
698 }
699 if ( !faidx_has_seq(args->fai,new_chr) ) // modification did not help, this sequence is not in fai
700 {
701 static int unkwn_chr_warned = 0;
702 if ( !unkwn_chr_warned && args->verbosity>0 )
703 fprintf(stderr,"Warning: GFF chromosome \"%s\" not part of the reference genome\n",chr_beg);
704 unkwn_chr_warned = 1;
705 free(new_chr);
706 new_chr = strdup(chr_beg); // use the original sequence name
707 }
708 }
709 if ( khash_str2int_get(aux->seq2int, new_chr, &iseq)!=0 )
710 {
711 hts_expand(char*, aux->nseq+1, aux->mseq, aux->seq);
712 aux->seq[aux->nseq] = new_chr;
713 iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]);
714 aux->nseq++;
715 assert( aux->nseq < 1<<29 ); // see gf_gene_t.iseq and ftr_t.iseq
716 }
717 else
718 free(new_chr);
719 }
720 chr_end[1] = c;
721 return iseq;
722 }
gff_skip(const char * line,char * ss)723 static inline char *gff_skip(const char *line, char *ss)
724 {
725 while ( *ss && *ss!='\t' ) ss++;
726 if ( !*ss ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
727 return ss+1;
728 }
gff_parse_chr(const char * line,char ** chr_beg,char ** chr_end)729 static inline void gff_parse_chr(const char *line, char **chr_beg, char **chr_end)
730 {
731 char *se = (char*) line;
732 while ( *se && *se!='\t' ) se++;
733 if ( !*se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
734 *chr_beg = (char*) line;
735 *chr_end = se-1;
736 }
gff_parse_beg_end(const char * line,char * ss,uint32_t * beg,uint32_t * end)737 static inline char *gff_parse_beg_end(const char *line, char *ss, uint32_t *beg, uint32_t *end)
738 {
739 char *se = ss;
740 *beg = strtol(ss, &se, 10) - 1;
741 if ( ss==se ) error("[%s:%d %s] Could not parse the line:\n\t%s\n\t%s\n",__FILE__,__LINE__,__FUNCTION__,line,ss);
742 ss = se+1;
743 *end = strtol(ss, &se, 10) - 1;
744 if ( ss==se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
745 return se+1;
746 }
gff_id_init(id_tbl_t * tbl)747 static void gff_id_init(id_tbl_t *tbl)
748 {
749 memset(tbl, 0, sizeof(*tbl));
750 tbl->str2id = khash_str2int_init();
751 }
gff_id_destroy(id_tbl_t * tbl)752 static void gff_id_destroy(id_tbl_t *tbl)
753 {
754 khash_str2int_destroy_free(tbl->str2id);
755 free(tbl->str);
756 }
gff_id_parse(id_tbl_t * tbl,const char * line,const char * needle,char * ss)757 static inline uint32_t gff_id_parse(id_tbl_t *tbl, const char *line, const char *needle, char *ss)
758 {
759 ss = strstr(ss,needle); // e.g. "ID=transcript:"
760 if ( !ss ) error("[%s:%d %s] Could not parse the line, \"%s\" not present: %s\n",__FILE__,__LINE__,__FUNCTION__,needle,line);
761 ss += strlen(needle);
762
763 char *se = ss;
764 while ( *se && *se!=';' && !isspace(*se) ) se++;
765 char tmp = *se;
766 *se = 0;
767
768 int id;
769 if ( khash_str2int_get(tbl->str2id, ss, &id) < 0 )
770 {
771 id = tbl->nstr++;
772 hts_expand(char*, tbl->nstr, tbl->mstr, tbl->str);
773 tbl->str[id] = strdup(ss);
774 khash_str2int_set(tbl->str2id, tbl->str[id], id);
775 }
776 *se = tmp;
777
778 return id;
779 }
gff_parse_type(char * line)780 static inline int gff_parse_type(char *line)
781 {
782 line = strstr(line,"ID=");
783 if ( !line ) return -1;
784 line += 3;
785 if ( !strncmp(line,"transcript:",11) ) return GFF_TSCRIPT_LINE;
786 else if ( !strncmp(line,"gene:",5) ) return GFF_GENE_LINE;
787 return -1;
788 }
gff_parse_biotype(char * _line)789 static inline int gff_parse_biotype(char *_line)
790 {
791 char *line = strstr(_line,"biotype=");
792 if ( !line ) return -1;
793
794 line += 8;
795 switch (*line)
796 {
797 case 'p':
798 if ( !strncmp(line,"protein_coding",14) ) return GF_PROTEIN_CODING;
799 else if ( !strncmp(line,"pseudogene",10) ) return GF_PSEUDOGENE;
800 else if ( !strncmp(line,"processed_transcript",20) ) return GF_PROCESSED_TRANSCRIPT;
801 else if ( !strncmp(line,"processed_pseudogene",20) ) return GF_PROCESSED_PSEUDOGENE;
802 else if ( !strncmp(line,"polymorphic_pseudogene",22) ) return GF_POLYMORPHIC_PSEUDOGENE;
803 break;
804 case 'a':
805 if ( !strncmp(line,"artifact",8) ) return GF_ARTIFACT;
806 else if ( !strncmp(line,"antisense",9) ) return GF_ANTISENSE;
807 else if ( !strncmp(line,"ambiguous_orf",13) ) return GF_AMBIGUOUS_ORF;
808 break;
809 case 'I':
810 if ( !strncmp(line,"IG_C_gene",9) ) return GF_IG_C;
811 else if ( !strncmp(line,"IG_D_gene",9) ) return GF_IG_D;
812 else if ( !strncmp(line,"IG_J_gene",9) ) return GF_IG_J;
813 else if ( !strncmp(line,"IG_LV_gene",10) ) return GF_IG_LV;
814 else if ( !strncmp(line,"IG_V_gene",9) ) return GF_IG_V;
815 else if ( !strncmp(line,"IG_pseudogene",13) ) return GF_IG_PSEUDOGENE;
816 else if ( !strncmp(line,"IG_C_pseudogene",15) ) return GF_IG_C_PSEUDOGENE;
817 else if ( !strncmp(line,"IG_J_pseudogene",15) ) return GF_IG_J_PSEUDOGENE;
818 else if ( !strncmp(line,"IG_V_pseudogene",15) ) return GF_IG_V_PSEUDOGENE;
819 break;
820 case 'T':
821 if ( !strncmp(line,"TR_C_gene",9) ) return GF_TR_C;
822 else if ( !strncmp(line,"TR_D_gene",9) ) return GF_TR_D;
823 else if ( !strncmp(line,"TR_J_gene",9) ) return GF_TR_J;
824 else if ( !strncmp(line,"TR_V_gene",9) ) return GF_TR_V;
825 else if ( !strncmp(line,"TR_V_pseudogene",15) ) return GF_TR_V_PSEUDOGENE;
826 else if ( !strncmp(line,"TR_J_pseudogene",15) ) return GF_TR_J_PSEUDOGENE;
827 break;
828 case 'M':
829 if ( !strncmp(line,"Mt_tRNA_pseudogene",18) ) return GF_MT_tRNA_PSEUDOGENE;
830 else if ( !strncmp(line,"Mt_tRNA",7) ) return GF_MT_tRNA;
831 else if ( !strncmp(line,"Mt_rRNA",7) ) return GF_MT_tRNA;
832 break;
833 case 'l':
834 if ( !strncmp(line,"lincRNA",7) ) return GF_lincRNA;
835 break;
836 case 'm':
837 if ( !strncmp(line,"macro_lncRNA",12) ) return GF_macro_lncRNA;
838 else if ( !strncmp(line,"misc_RNA_pseudogene",19) ) return GF_misc_RNA_PSEUDOGENE;
839 else if ( !strncmp(line,"miRNA_pseudogene",16) ) return GF_miRNA_PSEUDOGENE;
840 else if ( !strncmp(line,"miRNA",5) ) return GF_miRNA;
841 else if ( !strncmp(line,"misc_RNA",8) ) return GF_MISC_RNA;
842 break;
843 case 'r':
844 if ( !strncmp(line,"rRNA",4) ) return GF_rRNA;
845 else if ( !strncmp(line,"ribozyme",8) ) return GF_RIBOZYME;
846 else if ( !strncmp(line,"retained_intron",15) ) return GF_RETAINED_INTRON;
847 else if ( !strncmp(line,"retrotransposed",15) ) return GF_RETROTRANSPOSED;
848 break;
849 case 's':
850 if ( !strncmp(line,"snRNA",5) ) return GF_snRNA;
851 else if ( !strncmp(line,"sRNA",4) ) return GF_sRNA;
852 else if ( !strncmp(line,"scRNA",5) ) return GF_scRNA;
853 else if ( !strncmp(line,"scaRNA",6) ) return GF_scaRNA;
854 else if ( !strncmp(line,"snoRNA",6) ) return GF_snoRNA;
855 else if ( !strncmp(line,"sense_intronic",14) ) return GF_SENSE_INTRONIC;
856 else if ( !strncmp(line,"sense_overlapping",17) ) return GF_SENSE_OVERLAPPING;
857 break;
858 case 't':
859 if ( !strncmp(line,"tRNA_pseudogene",15) ) return GF_tRNA_PSEUDOGENE;
860 else if ( !strncmp(line,"transcribed_processed_pseudogene",32) ) return GF_TRANSCRIBED_PROCESSED_PSEUDOGENE;
861 else if ( !strncmp(line,"transcribed_unprocessed_pseudogene",34) ) return GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE;
862 else if ( !strncmp(line,"transcribed_unitary_pseudogene",30) ) return GF_TRANSCRIBED_UNITARY_PSEUDOGENE;
863 else if ( !strncmp(line,"translated_unprocessed_pseudogene",33) ) return GF_TRANSLATED_UNPROCESSED_PSEUDOGENE;
864 else if ( !strncmp(line,"translated_processed_pseudogene",31) ) return GF_TRANSLATED_PROCESSED_PSEUDOGENE;
865 break;
866 case 'n':
867 if ( !strncmp(line,"nonsense_mediated_decay",23) ) return GF_NMD;
868 else if ( !strncmp(line,"non_stop_decay",14) ) return GF_NON_STOP_DECAY;
869 break;
870 case 'k':
871 if ( !strncmp(line,"known_ncrna",11) ) return GF_KNOWN_NCRNA;
872 break;
873 case 'u':
874 if ( !strncmp(line,"unitary_pseudogene",18) ) return GF_UNITARY_PSEUDOGENE;
875 else if ( !strncmp(line,"unprocessed_pseudogene",22) ) return GF_UNPROCESSED_PSEUDOGENE;
876 break;
877 case 'L':
878 if ( !strncmp(line,"LRG_gene",8) ) return GF_LRG_GENE;
879 break;
880 case '3':
881 if ( !strncmp(line,"3prime_overlapping_ncRNA",24) ) return GF_3PRIME_OVERLAPPING_ncRNA;
882 break;
883 case 'd':
884 if ( !strncmp(line,"disrupted_domain",16) ) return GF_DISRUPTED_DOMAIN;
885 break;
886 case 'v':
887 if ( !strncmp(line,"vaultRNA",8) ) return GF_vaultRNA;
888 break;
889 case 'b':
890 if ( !strncmp(line,"bidirectional_promoter_lncRNA",29) ) return GF_BIDIRECTIONAL_PROMOTER_lncRNA;
891 break;
892 }
893 return 0;
894 }
gff_ignored_biotype(args_t * args,char * ss)895 static inline int gff_ignored_biotype(args_t *args, char *ss)
896 {
897 ss = strstr(ss,"biotype=");
898 if ( !ss ) return 0;
899
900 ss += 8;
901 char *se = ss, tmp;
902 while ( *se && *se!=';' ) se++;
903 tmp = *se;
904 *se = 0;
905
906 char *key = ss;
907 int n = 0;
908 if ( khash_str2int_get(args->init.ignored_biotypes, ss, &n)!=0 ) key = strdup(ss);
909 khash_str2int_set(args->init.ignored_biotypes, key, n+1);
910
911 *se = tmp;
912 return 1;
913 }
gene_init(aux_t * aux,uint32_t gene_id)914 gf_gene_t *gene_init(aux_t *aux, uint32_t gene_id)
915 {
916 khint_t k = kh_get(int2gene, aux->gid2gene, (int)gene_id);
917 gf_gene_t *gene = (k == kh_end(aux->gid2gene)) ? NULL : kh_val(aux->gid2gene, k);
918 if ( !gene )
919 {
920 gene = (gf_gene_t*) calloc(1,sizeof(gf_gene_t));
921 int ret;
922 k = kh_put(int2gene, aux->gid2gene, (int)gene_id, &ret);
923 kh_val(aux->gid2gene,k) = gene;
924 }
925 return gene;
926 }
gff_parse_transcript(args_t * args,const char * line,char * ss,ftr_t * ftr)927 void gff_parse_transcript(args_t *args, const char *line, char *ss, ftr_t *ftr)
928 {
929 aux_t *aux = &args->init;
930 int biotype = gff_parse_biotype(ss);
931 if ( biotype <= 0 )
932 {
933 if ( !gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(stderr,"ignored transcript: %s\n",line);
934 return;
935 }
936
937 // create a mapping from transcript_id to gene_id
938 uint32_t trid = gff_id_parse(&args->tscript_ids, line, "ID=transcript:", ss);
939 uint32_t gene_id = gff_id_parse(&args->init.gene_ids, line, "Parent=gene:", ss);
940
941 tscript_t *tr = (tscript_t*) calloc(1,sizeof(tscript_t));
942 tr->id = trid;
943 tr->strand = ftr->strand;
944 tr->gene = gene_init(aux, gene_id);
945 tr->type = biotype;
946 tr->beg = ftr->beg;
947 tr->end = ftr->end;
948
949 khint_t k;
950 int ret;
951 k = kh_put(int2tscript, aux->id2tr, (int)trid, &ret);
952 kh_val(aux->id2tr,k) = tr;
953 }
gff_parse_gene(args_t * args,const char * line,char * ss,char * chr_beg,char * chr_end,ftr_t * ftr)954 void gff_parse_gene(args_t *args, const char *line, char *ss, char *chr_beg, char *chr_end, ftr_t *ftr)
955 {
956 int biotype = gff_parse_biotype(ss);
957 if ( biotype <= 0 )
958 {
959 if ( !gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(stderr,"ignored gene: %s\n",line);
960 return;
961 }
962
963 aux_t *aux = &args->init;
964
965 // substring search for "ID=gene:ENSG00000437963"
966 uint32_t gene_id = gff_id_parse(&aux->gene_ids, line, "ID=gene:", ss);
967 gf_gene_t *gene = gene_init(aux, gene_id);
968 assert( !gene->name ); // the gene_id should be unique
969
970 gene->iseq = feature_set_seq(args, chr_beg,chr_end);
971
972 // substring search for "Name=OR4F5"
973 ss = strstr(chr_end+2,"Name=");
974 if ( ss )
975 {
976 ss += 5;
977 char *se = ss;
978 while ( *se && *se!=';' && !isspace(*se) ) se++;
979 gene->name = (char*) malloc(se-ss+1);
980 memcpy(gene->name,ss,se-ss);
981 gene->name[se-ss] = 0;
982 }
983 else
984 gene->name = strdup(aux->gene_ids.str[gene_id]); // Name=<GeneName> field is not present, use the gene ID instead
985 }
gff_parse(args_t * args,char * line,ftr_t * ftr)986 int gff_parse(args_t *args, char *line, ftr_t *ftr)
987 {
988 // - skip empty lines and commented lines
989 // - columns
990 // 1. chr
991 // 2. <skip>
992 // 3. CDS, transcript, gene, ...
993 // 4-5. beg,end
994 // 6. <skip>
995 // 7. strand
996 // 8. phase
997 // 9. Parent=transcript:ENST(\d+);ID=... etc
998
999 char *ss = line;
1000 if ( !*ss ) return -1; // skip blank lines
1001 if ( *ss=='#' ) return -1; // skip comments
1002
1003 char *chr_beg, *chr_end;
1004 gff_parse_chr(line, &chr_beg, &chr_end);
1005 ss = gff_skip(line, chr_end + 2);
1006
1007 // 3. column: is this a CDS, transcript, gene, etc.
1008 if ( !strncmp("exon\t",ss,5) ) { ftr->type = GF_EXON; ss += 5; }
1009 else if ( !strncmp("CDS\t",ss,4) ) { ftr->type = GF_CDS; ss += 4; }
1010 else if ( !strncmp("three_prime_UTR\t",ss,16) ) { ftr->type = GF_UTR3; ss += 16; }
1011 else if ( !strncmp("five_prime_UTR\t",ss,15) ) { ftr->type = GF_UTR5; ss += 15; }
1012 else
1013 {
1014 ss = gff_skip(line, ss);
1015 ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end);
1016 ss = gff_skip(line, ss);
1017 int type = gff_parse_type(ss);
1018 if ( type!=GFF_TSCRIPT_LINE && type!=GFF_GENE_LINE )
1019 {
1020 // we ignore these, debug print to see new types:
1021 ss = strstr(ss,"ID=");
1022 if ( !ss ) return -1; // no ID, ignore the line
1023 if ( !strncmp("chromosome",ss+3,10) ) return -1;
1024 if ( !strncmp("supercontig",ss+3,11) ) return -1;
1025 if ( args->verbosity > 0 ) fprintf(stderr,"ignored: %s\n", line);
1026 return -1;
1027 }
1028
1029 // 7. column: strand
1030 if ( *ss == '+' ) ftr->strand = STRAND_FWD;
1031 else if ( *ss == '-' ) ftr->strand = STRAND_REV;
1032 else error("Unknown strand: %c .. %s\n", *ss,ss);
1033
1034 if ( type==GFF_TSCRIPT_LINE )
1035 gff_parse_transcript(args, line, ss, ftr);
1036 else
1037 gff_parse_gene(args, line, ss, chr_beg, chr_end, ftr);
1038
1039 return -1;
1040 }
1041 ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end);
1042 ss = gff_skip(line, ss);
1043
1044 // 7. column: strand
1045 if ( *ss == '+' ) ftr->strand = STRAND_FWD;
1046 else if ( *ss == '-' ) ftr->strand = STRAND_REV;
1047 else { if ( args->verbosity > 0 ) fprintf(stderr,"Skipping unknown strand: %c\n", *ss); return -1; }
1048 ss += 2;
1049
1050 // 8. column: phase (codon offset)
1051 if ( *ss == '0' ) ftr->phase = 0;
1052 else if ( *ss == '1' ) ftr->phase = 1;
1053 else if ( *ss == '2' ) ftr->phase = 2;
1054 else if ( *ss == '.' ) ftr->phase = 0; // exons do not have phase
1055 else { if ( args->verbosity > 0 ) fprintf(stderr,"Skipping unknown phase: %c, %s\n", *ss, line); return -1; }
1056 ss += 2;
1057
1058 // substring search for "Parent=transcript:ENST00000437963"
1059 ftr->trid = gff_id_parse(&args->tscript_ids, line, "Parent=transcript:", ss);
1060 ftr->iseq = feature_set_seq(args, chr_beg,chr_end);
1061 return 0;
1062 }
1063
cmp_cds_ptr(const void * a,const void * b)1064 static int cmp_cds_ptr(const void *a, const void *b)
1065 {
1066 // comparison function for qsort of transcripts's CDS
1067 if ( (*((gf_cds_t**)a))->beg < (*((gf_cds_t**)b))->beg ) return -1;
1068 if ( (*((gf_cds_t**)a))->beg > (*((gf_cds_t**)b))->beg ) return 1;
1069 return 0;
1070 }
1071
chr_beg_end(aux_t * aux,int iseq,char ** chr_beg,char ** chr_end)1072 static inline void chr_beg_end(aux_t *aux, int iseq, char **chr_beg, char **chr_end)
1073 {
1074 *chr_beg = *chr_end = aux->seq[iseq];
1075 while ( (*chr_end)[1] ) (*chr_end)++;
1076 }
tscript_init(aux_t * aux,uint32_t trid)1077 tscript_t *tscript_init(aux_t *aux, uint32_t trid)
1078 {
1079 khint_t k = kh_get(int2tscript, aux->id2tr, (int)trid);
1080 tscript_t *tr = (k == kh_end(aux->id2tr)) ? NULL : kh_val(aux->id2tr, k);
1081 assert( tr );
1082 return tr;
1083 }
register_cds(args_t * args,ftr_t * ftr)1084 void register_cds(args_t *args, ftr_t *ftr)
1085 {
1086 // Make the CDS searchable via idx_cds. Note we do not malloc tr->cds just yet.
1087 // ftr is the result of parsing a gff CDS line
1088 aux_t *aux = &args->init;
1089
1090 tscript_t *tr = tscript_init(aux, ftr->trid);
1091 if ( tr->strand != ftr->strand ) error("Conflicting strand in transcript %"PRIu32" .. %d vs %d\n",ftr->trid,tr->strand,ftr->strand);
1092
1093 gf_cds_t *cds = (gf_cds_t*) malloc(sizeof(gf_cds_t));
1094 cds->tr = tr;
1095 cds->beg = ftr->beg;
1096 cds->len = ftr->end - ftr->beg + 1;
1097 cds->icds = 0; // to keep valgrind on mac happy
1098 cds->phase = ftr->phase;
1099
1100 hts_expand(gf_cds_t*,tr->ncds+1,tr->mcds,tr->cds);
1101 tr->cds[tr->ncds++] = cds;
1102 }
register_utr(args_t * args,ftr_t * ftr)1103 void register_utr(args_t *args, ftr_t *ftr)
1104 {
1105 aux_t *aux = &args->init;
1106 gf_utr_t *utr = (gf_utr_t*) malloc(sizeof(gf_utr_t));
1107 utr->which = ftr->type==GF_UTR3 ? prime3 : prime5;
1108 utr->beg = ftr->beg;
1109 utr->end = ftr->end;
1110 utr->tr = tscript_init(aux, ftr->trid);
1111
1112 char *chr_beg, *chr_end;
1113 chr_beg_end(&args->init, utr->tr->gene->iseq, &chr_beg, &chr_end);
1114 regidx_push(args->idx_utr, chr_beg,chr_end, utr->beg,utr->end, &utr);
1115 }
register_exon(args_t * args,ftr_t * ftr)1116 void register_exon(args_t *args, ftr_t *ftr)
1117 {
1118 aux_t *aux = &args->init;
1119 gf_exon_t *exon = (gf_exon_t*) malloc(sizeof(gf_exon_t));
1120 exon->beg = ftr->beg;
1121 exon->end = ftr->end;
1122 exon->tr = tscript_init(aux, ftr->trid);
1123
1124 char *chr_beg, *chr_end;
1125 chr_beg_end(&args->init, exon->tr->gene->iseq, &chr_beg, &chr_end);
1126 regidx_push(args->idx_exon, chr_beg,chr_end, exon->beg - N_SPLICE_REGION_INTRON, exon->end + N_SPLICE_REGION_INTRON, &exon);
1127 }
1128
tscript_init_cds(args_t * args)1129 void tscript_init_cds(args_t *args)
1130 {
1131 aux_t *aux = &args->init;
1132
1133 // Sort CDS in all transcripts, set offsets, check their phase, length, create index (idx_cds)
1134 khint_t k;
1135 for (k=0; k<kh_end(aux->id2tr); k++)
1136 {
1137 if ( !kh_exist(aux->id2tr, k) ) continue;
1138 tscript_t *tr = (tscript_t*) kh_val(aux->id2tr, k);
1139
1140 // position-to-tscript lookup
1141 char *chr_beg, *chr_end;
1142 chr_beg_end(aux, tr->gene->iseq, &chr_beg, &chr_end);
1143 regidx_push(args->idx_tscript, chr_beg, chr_end, tr->beg, tr->end, &tr);
1144
1145 if ( !tr->ncds ) continue; // transcript with no CDS
1146
1147 // sort CDs
1148 qsort(tr->cds, tr->ncds, sizeof(gf_cds_t*), cmp_cds_ptr);
1149
1150 // trim non-coding start
1151 int i, len = 0;
1152 if ( tr->strand==STRAND_FWD )
1153 {
1154 if ( tr->cds[0]->phase ) tr->trim |= TRIM_5PRIME;
1155 tr->cds[0]->beg += tr->cds[0]->phase;
1156 tr->cds[0]->len -= tr->cds[0]->phase;
1157 tr->cds[0]->phase = 0;
1158
1159 // sanity check phase; the phase number in gff tells us how many bases to skip in this
1160 // feature to reach the first base of the next codon
1161 int tscript_ok = 1;
1162 for (i=0; i<tr->ncds; i++)
1163 {
1164 int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
1165 if ( phase!=len%3)
1166 {
1167 if ( args->force )
1168 {
1169 if ( args->verbosity > 0 )
1170 fprintf(stderr,"Warning: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
1171 tscript_ok = 0;
1172 break;
1173 }
1174 error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
1175 args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
1176 }
1177 len += tr->cds[i]->len;
1178 }
1179 if ( !tscript_ok ) continue; // skip this transcript
1180 }
1181 else
1182 {
1183 // Check that the phase is not bigger than CDS length. Curiously, this can really happen,
1184 // see Mus_musculus.GRCm38.85.gff3.gz, transcript:ENSMUST00000163141
1185 // todo: the same for the fwd strand
1186 i = tr->ncds - 1;
1187 int phase = tr->cds[i]->phase;
1188 if ( phase ) tr->trim |= TRIM_5PRIME;
1189 while ( i>=0 && phase > tr->cds[i]->len )
1190 {
1191 phase -= tr->cds[i]->len;
1192 tr->cds[i]->phase = 0;
1193 tr->cds[i]->len = 0;
1194 i--;
1195 }
1196 tr->cds[i]->len -= tr->cds[i]->phase;
1197 tr->cds[i]->phase = 0;
1198
1199 // sanity check phase
1200 int tscript_ok = 1;
1201 for (i=tr->ncds-1; i>=0; i--)
1202 {
1203 int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
1204 if ( phase!=len%3)
1205 {
1206 if ( args->force )
1207 {
1208 if ( args->verbosity > 0 )
1209 fprintf(stderr,"Warning: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
1210 tscript_ok = 0;
1211 break;
1212 }
1213 error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
1214 args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
1215 }
1216 len += tr->cds[i]->len;
1217 }
1218 if ( !tscript_ok ) continue; // skip this transcript
1219 }
1220
1221 // set len. At the same check that CDS within a transcript do not overlap
1222 len = 0;
1223 for (i=0; i<tr->ncds; i++)
1224 {
1225 tr->cds[i]->icds = i;
1226 len += tr->cds[i]->len;
1227 if ( !i ) continue;
1228
1229 gf_cds_t *a = tr->cds[i-1];
1230 gf_cds_t *b = tr->cds[i];
1231 if ( a->beg + a->len - 1 >= b->beg )
1232 {
1233 if ( args->force )
1234 {
1235 fprintf(stderr,"Warning: GFF contains overlapping CDS %s: %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32".\n",
1236 args->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len);
1237 }
1238 else
1239 error("Error: CDS overlap in the transcript %s: %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32", is this intended (e.g. ribosomal slippage)?\n"
1240 " Use the --force option to override (at your own risk).\n",
1241 args->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len);
1242 }
1243 }
1244 if ( len%3 != 0 )
1245 {
1246 // There are 13k transcripts with incomplete 3' CDS. See for example ENST00000524289
1247 // http://sep2015.archive.ensembl.org/Homo_sapiens/Transcript/Sequence_cDNA?db=core;g=ENSG00000155868;r=5:157138846-157159019;t=ENST00000524289
1248 // Also, the incomplete CDS can be too short (1 or 2bp), so it is not enough to trim the last one.
1249
1250 tr->trim |= TRIM_3PRIME;
1251 if ( tr->strand==STRAND_FWD )
1252 {
1253 i = tr->ncds - 1;
1254 while ( i>=0 && len%3 )
1255 {
1256 int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len;
1257 tr->cds[i]->len -= dlen;
1258 len -= dlen;
1259 i--;
1260 }
1261 }
1262 else
1263 {
1264 i = 0;
1265 while ( i<tr->ncds && len%3 )
1266 {
1267 int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len;
1268 tr->cds[i]->len -= dlen;
1269 tr->cds[i]->beg += dlen;
1270 len -= dlen;
1271 i++;
1272 }
1273 }
1274 }
1275
1276 // set CDS offsets and insert into regidx
1277 len=0;
1278 for (i=0; i<tr->ncds; i++)
1279 {
1280 tr->cds[i]->pos = len;
1281 len += tr->cds[i]->len;
1282 regidx_push(args->idx_cds, chr_beg,chr_end, tr->cds[i]->beg,tr->cds[i]->beg+tr->cds[i]->len-1, &tr->cds[i]);
1283 }
1284 }
1285 }
1286
regidx_free_gf(void * payload)1287 void regidx_free_gf(void *payload) { free(*((gf_cds_t**)payload)); }
regidx_free_tscript(void * payload)1288 void regidx_free_tscript(void *payload) { tscript_t *tr = *((tscript_t**)payload); free(tr->cds); free(tr); }
1289
init_gff(args_t * args)1290 void init_gff(args_t *args)
1291 {
1292 aux_t *aux = &args->init;
1293 aux->seq2int = khash_str2int_init(); // chrom's numeric id
1294 aux->gid2gene = kh_init(int2gene); // gene id to gf_gene_t, for idx_gene
1295 aux->id2tr = kh_init(int2tscript); // transcript id to tscript_t
1296 args->idx_tscript = regidx_init(NULL, NULL, regidx_free_tscript, sizeof(tscript_t*), NULL);
1297 aux->ignored_biotypes = khash_str2int_init();
1298 gff_id_init(&aux->gene_ids);
1299 gff_id_init(&args->tscript_ids);
1300
1301 // parse gff
1302 kstring_t str = {0,0,0};
1303 htsFile *fp = hts_open(args->gff_fname,"r");
1304 if ( !fp ) error("Failed to read %s\n", args->gff_fname);
1305 while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 )
1306 {
1307 hts_expand(ftr_t, aux->nftr+1, aux->mftr, aux->ftr);
1308 int ret = gff_parse(args, str.s, aux->ftr + aux->nftr);
1309 if ( !ret ) aux->nftr++;
1310 }
1311 free(str.s);
1312 if ( hts_close(fp)!=0 ) error("Close failed: %s\n", args->gff_fname);
1313
1314
1315 // process gff information: connect CDS and exons to transcripts
1316 args->idx_cds = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_cds_t*), NULL);
1317 args->idx_utr = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_utr_t*), NULL);
1318 args->idx_exon = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_exon_t*), NULL);
1319 args->itr = regitr_init(NULL);
1320
1321 int i;
1322 for (i=0; i<aux->nftr; i++)
1323 {
1324 ftr_t *ftr = &aux->ftr[i];
1325
1326 // check whether to keep this feature: is there a mapping trid -> gene_id -> gene?
1327 khint_t k = kh_get(int2tscript, aux->id2tr, (int)ftr->trid);
1328 if ( k==kh_end(aux->id2tr) ) continue; // no such transcript
1329
1330 tscript_t *tr = kh_val(aux->id2tr,k);
1331 if ( !tr->gene->name )
1332 {
1333 // not a supported biotype (e.g. gene:pseudogene, transcript:processed_transcript)
1334 regidx_free_tscript(&tr);
1335 kh_del(int2tscript, aux->id2tr,k);
1336 continue;
1337 }
1338
1339 // populate regidx by category:
1340 // ftr->type .. GF_CDS, GF_EXON, GF_UTR3, GF_UTR5
1341 // gene->type .. GF_PROTEIN_CODING, GF_MT_rRNA, GF_IG_C, ...
1342 if ( ftr->type==GF_CDS ) register_cds(args, ftr);
1343 else if ( ftr->type==GF_EXON ) register_exon(args, ftr);
1344 else if ( ftr->type==GF_UTR5 ) register_utr(args, ftr);
1345 else if ( ftr->type==GF_UTR3 ) register_utr(args, ftr);
1346 else
1347 error("something: %s\t%d\t%d\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,args->tscript_ids.str[ftr->trid],gf_type2gff_string(ftr->type));
1348 }
1349 tscript_init_cds(args);
1350
1351 if ( args->verbosity > 0 )
1352 {
1353 fprintf(stderr,"Indexed %d transcripts, %d exons, %d CDSs, %d UTRs\n",
1354 regidx_nregs(args->idx_tscript),
1355 regidx_nregs(args->idx_exon),
1356 regidx_nregs(args->idx_cds),
1357 regidx_nregs(args->idx_utr));
1358 }
1359
1360 free(aux->ftr);
1361 khash_str2int_destroy_free(aux->seq2int);
1362 // keeping only to destroy the genes at the end: kh_destroy(int2gene,aux->gid2gene);
1363 kh_destroy(int2tscript,aux->id2tr);
1364 free(aux->seq);
1365 gff_id_destroy(&aux->gene_ids);
1366
1367 if ( args->verbosity > 0 && khash_str2int_size(aux->ignored_biotypes) )
1368 {
1369 khash_t(str2int) *ign = (khash_t(str2int)*)aux->ignored_biotypes;
1370 fprintf(stderr,"Ignored the following biotypes:\n");
1371 for (i = kh_begin(ign); i < kh_end(ign); i++)
1372 {
1373 if ( !kh_exist(ign,i)) continue;
1374 const char *biotype = kh_key(ign,i);
1375 if ( !strcmp(biotype,"TCE") ) biotype = "TCE (\"To be Experimentally Confirmed\")";
1376 fprintf(stderr,"\t%dx\t.. %s\n", kh_value(ign,i), biotype);
1377 }
1378 }
1379 khash_str2int_destroy_free(aux->ignored_biotypes);
1380 }
1381
ncsq2_to_nfmt(int ncsq2)1382 static inline int ncsq2_to_nfmt(int ncsq2)
1383 {
1384 return 1 + (ncsq2 - 1) / 30;
1385 }
icsq2_to_bit(int icsq2,int * ival,int * ibit)1386 static inline void icsq2_to_bit(int icsq2, int *ival, int *ibit)
1387 {
1388 *ival = icsq2 / 30;
1389 *ibit = icsq2 % 30;
1390 }
1391
init_data(args_t * args)1392 void init_data(args_t *args)
1393 {
1394 args->nfmt_bcsq = ncsq2_to_nfmt(args->ncsq2_max);
1395
1396 args->fai = fai_load(args->fa_fname);
1397 if ( !args->fai ) error("Failed to load the fai index: %s\n", args->fa_fname);
1398
1399 if ( args->verbosity > 0 ) fprintf(stderr,"Parsing %s ...\n", args->gff_fname);
1400 init_gff(args);
1401
1402 args->rid = -1;
1403
1404 if ( args->filter_str )
1405 args->filter = filter_init(args->hdr, args->filter_str);
1406
1407 args->pos2vbuf = kh_init(pos2vbuf);
1408 args->active_tr = khp_init(trhp);
1409 args->hap = (hap_t*) calloc(1,sizeof(hap_t));
1410
1411 // init samples
1412 if ( !bcf_hdr_nsamples(args->hdr) ) args->phase = PHASE_DROP_GT;
1413 if ( args->sample_list && !strcmp("-",args->sample_list) )
1414 {
1415 // ignore all samples
1416 if ( args->output_type==FT_TAB_TEXT )
1417 {
1418 // significant speedup for plain VCFs
1419 if (bcf_hdr_set_samples(args->hdr,NULL,0) < 0)
1420 error_errno("[%s] Couldn't build sample filter", __func__);
1421 }
1422 args->phase = PHASE_DROP_GT;
1423 }
1424 else
1425 args->smpl = smpl_ilist_init(args->hdr, args->sample_list, args->sample_is_file, SMPL_STRICT);
1426 args->hdr_nsmpl = args->phase==PHASE_DROP_GT ? 0 : bcf_hdr_nsamples(args->hdr);
1427
1428 if ( args->output_type==FT_TAB_TEXT )
1429 {
1430 args->out = args->output_fname ? fopen(args->output_fname,"w") : stdout;
1431 if ( !args->out ) error("Failed to write to %s: %s\n", !strcmp("-",args->output_fname)?"standard output":args->output_fname,strerror(errno));
1432
1433 fprintf(args->out,"# This file was produced by: bcftools +csq(%s+htslib-%s)\n", bcftools_version(),hts_version());
1434 fprintf(args->out,"# The command line was:\tbcftools +%s", args->argv[0]);
1435 int i;
1436 for (i=1; i<args->argc; i++)
1437 fprintf(args->out," %s",args->argv[i]);
1438 fprintf(args->out,"\n");
1439 fprintf(args->out,"# LOG\t[2]Message\n");
1440 fprintf(args->out,"# CSQ"); i = 1;
1441 fprintf(args->out,"\t[%d]Sample", ++i);
1442 fprintf(args->out,"\t[%d]Haplotype", ++i);
1443 fprintf(args->out,"\t[%d]Chromosome", ++i);
1444 fprintf(args->out,"\t[%d]Position", ++i);
1445 fprintf(args->out,"\t[%d]Consequence", ++i);
1446 fprintf(args->out,"\n");
1447 }
1448 else
1449 {
1450 char wmode[8];
1451 set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
1452 args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode);
1453 if ( args->out_fh == NULL ) error("[%s] Error: cannot write to %s: %s\n", __func__,args->output_fname? args->output_fname : "standard output", strerror(errno));
1454 if ( args->n_threads > 0)
1455 hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->sr->p);
1456 if ( args->record_cmd_line ) bcf_hdr_append_version(args->hdr,args->argc,args->argv,"bcftools/csq");
1457 bcf_hdr_printf(args->hdr,"##INFO=<ID=%s,Number=.,Type=String,Description=\"%s consequence annotation from BCFtools/csq, see http://samtools.github.io/bcftools/howtos/csq-calling.html for details. Format: Consequence|gene|transcript|biotype|strand|amino_acid_change|dna_change\">",args->bcsq_tag, args->local_csq ? "Local" : "Haplotype-aware");
1458 if ( args->hdr_nsmpl )
1459 bcf_hdr_printf(args->hdr,"##FORMAT=<ID=%s,Number=.,Type=Integer,Description=\"Bitmask of indexes to INFO/BCSQ, with interleaved first/second haplotype. Use \\\"bcftools query -f'[%%CHROM\\t%%POS\\t%%SAMPLE\\t%%TBCSQ\\n]'\\\" to translate.\">",args->bcsq_tag);
1460 if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname?args->output_fname:"standard output");
1461 }
1462 if ( args->verbosity > 0 ) fprintf(stderr,"Calling...\n");
1463 }
1464
destroy_data(args_t * args)1465 void destroy_data(args_t *args)
1466 {
1467 if ( args->ncsq2_small_warned )
1468 fprintf(stderr,
1469 "Note: Some samples had too many consequences to be represented in %d bytes. If you need to record them all,\n"
1470 " the limit can be increased by running with `--ncsq %d`.\n",ncsq2_to_nfmt(args->ncsq2_max)/8,1+args->ncsq2_small_warned/2);
1471
1472 regidx_destroy(args->idx_cds);
1473 regidx_destroy(args->idx_utr);
1474 regidx_destroy(args->idx_exon);
1475 regidx_destroy(args->idx_tscript);
1476 regitr_destroy(args->itr);
1477
1478 khint_t k,i,j;
1479 for (k=0; k<kh_end(args->init.gid2gene); k++)
1480 {
1481 if ( !kh_exist(args->init.gid2gene, k) ) continue;
1482 gf_gene_t *gene = (gf_gene_t*) kh_val(args->init.gid2gene, k);
1483 free(gene->name);
1484 free(gene);
1485 }
1486 kh_destroy(int2gene,args->init.gid2gene);
1487
1488 if ( args->filter )
1489 filter_destroy(args->filter);
1490
1491 khp_destroy(trhp,args->active_tr);
1492 kh_destroy(pos2vbuf,args->pos2vbuf);
1493 if ( args->smpl ) smpl_ilist_destroy(args->smpl);
1494 int ret;
1495 if ( args->out_fh )
1496 ret = hts_close(args->out_fh);
1497 else
1498 ret = fclose(args->out);
1499 if ( ret ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
1500 for (i=0; i<args->vcf_rbuf.m; i++)
1501 {
1502 vbuf_t *vbuf = args->vcf_buf[i];
1503 if ( !vbuf ) continue;
1504 for (j=0; j<vbuf->m; j++)
1505 {
1506 if ( !vbuf->vrec[j] ) continue;
1507 if ( vbuf->vrec[j]->line ) bcf_destroy(vbuf->vrec[j]->line);
1508 free(vbuf->vrec[j]->fmt_bm);
1509 free(vbuf->vrec[j]->vcsq);
1510 free(vbuf->vrec[j]);
1511 }
1512 free(vbuf->vrec);
1513 free(vbuf);
1514 }
1515 free(args->vcf_buf);
1516 free(args->rm_tr);
1517 free(args->csq_buf);
1518 free(args->hap->stack);
1519 free(args->hap->sseq.s);
1520 free(args->hap->tseq.s);
1521 free(args->hap->tref.s);
1522 free(args->hap);
1523 fai_destroy(args->fai);
1524 free(args->gt_arr);
1525 free(args->str.s);
1526 free(args->str2.s);
1527 gff_id_destroy(&args->tscript_ids);
1528 }
1529
1530 /*
1531 The splice_* functions are for consquences around splice sites: start,stop,splice_*
1532 */
1533 #define SPLICE_VAR_REF 0 // ref: ACGT>ACGT, csq not applicable, skip completely
1534 #define SPLICE_OUTSIDE 1 // splice acceptor or similar; csq set and is done, does not overlap the region
1535 #define SPLICE_INSIDE 2 // overlaps coding region; csq can be set but coding prediction is needed
1536 #define SPLICE_OVERLAP 3 // indel overlaps region boundary, csq set but could not determine csq
1537 typedef struct
1538 {
1539 tscript_t *tr;
1540 struct {
1541 int32_t pos, rlen, alen, ial;
1542 char *ref, *alt;
1543 bcf1_t *rec;
1544 } vcf;
1545 uint16_t check_acceptor:1, // check distance from exon start (fwd) or end (rev)
1546 check_start:1, // this is the first coding exon (relative to transcript orientation), check first (fwd) or last (rev) codon
1547 check_stop:1, // this is the last coding exon (relative to transcript orientation), check last (fwd) or first (rev) codon
1548 check_donor:1, // as with check_acceptor
1549 check_region_beg:1, // do/don't check for splices at this end, eg. in the first or last exon
1550 check_region_end:1, //
1551 check_utr:1, // check splice sites (acceptor/donor/region_*) only if not in utr
1552 set_refalt:1; // set kref,kalt, if set, check also for synonymous events
1553 uint32_t csq;
1554 int tbeg, tend; // number of trimmed bases from beg and end of ref,alt allele
1555 uint32_t ref_beg, // ref coordinates with spurious bases removed, ACC>AC can become AC>A or CC>C, whichever gives
1556 ref_end; // a more conservative csq (the first and last base in kref.s)
1557 kstring_t kref, kalt; // trimmed alleles, set only with SPLICE_OLAP
1558 }
1559 splice_t;
splice_init(splice_t * splice,bcf1_t * rec)1560 void splice_init(splice_t *splice, bcf1_t *rec)
1561 {
1562 memset(splice,0,sizeof(*splice));
1563 splice->vcf.rec = rec;
1564 splice->vcf.pos = rec->pos;
1565 splice->vcf.rlen = rec->rlen;
1566 splice->vcf.ref = rec->d.allele[0];
1567 splice->csq = 0;
1568 }
splice_build_hap(splice_t * splice,uint32_t beg,int len)1569 static inline void splice_build_hap(splice_t *splice, uint32_t beg, int len)
1570 {
1571 // len>0 .. beg is the first base, del filled from right
1572 // len<0 .. beg is the last base, del filled from left
1573
1574 int rlen, alen, rbeg, abeg; // first base to include (ref coordinates)
1575 if ( len<0 )
1576 {
1577 rlen = alen = -len;
1578 rbeg = beg - rlen + 1;
1579 int dlen = splice->vcf.alen - splice->vcf.rlen;
1580 if ( dlen<0 && beg < splice->ref_end ) // incomplete del, beg is in the middle
1581 dlen += splice->ref_end - beg;
1582 abeg = rbeg + dlen;
1583 }
1584 else
1585 {
1586 rbeg = abeg = beg;
1587 rlen = alen = len;
1588 // check for incomplete del as above??
1589 }
1590
1591 #define XDBG 0
1592 #if XDBG
1593 fprintf(stderr,"build_hap: rbeg=%d + %d abeg=%d \n",rbeg,rlen,abeg);
1594 #endif
1595 splice->kref.l = 0;
1596 splice->kalt.l = 0;
1597
1598 // add the part before vcf.ref, in the vcf.ref and after vcf.ref
1599 int roff; // how many vcf.ref bases already used
1600 if ( rbeg < splice->vcf.pos )
1601 {
1602 assert( splice->tr->beg <= rbeg ); // this can be extended thanks to N_REF_PAD
1603 kputsn(splice->tr->ref + N_REF_PAD + rbeg - splice->tr->beg, splice->vcf.pos - rbeg, &splice->kref);
1604 roff = 0;
1605 }
1606 else
1607 roff = rbeg - splice->vcf.pos;
1608 #if XDBG
1609 fprintf(stderr,"r1: %s roff=%d\n",splice->kref.s,roff);
1610 #endif
1611
1612 if ( roff < splice->vcf.rlen && splice->kref.l < rlen )
1613 {
1614 int len = splice->vcf.rlen - roff; // len still available in vcf.ref
1615 if ( len > rlen - splice->kref.l ) len = rlen - splice->kref.l; // how much of ref allele is still needed
1616 kputsn(splice->vcf.ref + roff, len, &splice->kref);
1617 }
1618 #if XDBG
1619 fprintf(stderr,"r2: %s\n",splice->kref.s);
1620 #endif
1621
1622 uint32_t end = splice->vcf.pos + splice->vcf.rlen; // position just after the ref allele
1623 if ( splice->kref.l < rlen )
1624 {
1625 if ( end + rlen - splice->kref.l - 1 > splice->tr->end ) // trim, the requested sequence is too long (could be extended, see N_REF_PAD)
1626 rlen -= end + rlen - splice->kref.l - 1 - splice->tr->end;
1627 if ( splice->kref.l < rlen )
1628 kputsn(splice->tr->ref + N_REF_PAD + end - splice->tr->beg, rlen - splice->kref.l, &splice->kref);
1629 }
1630 #if XDBG
1631 fprintf(stderr,"r3: %s\n",splice->kref.s);
1632 #endif
1633
1634
1635 int aoff;
1636 if ( abeg < splice->vcf.pos )
1637 {
1638 assert( splice->tr->beg <= abeg );
1639 kputsn(splice->tr->ref + N_REF_PAD + abeg - splice->tr->beg, splice->vcf.pos - abeg, &splice->kalt);
1640 aoff = 0;
1641 }
1642 else
1643 aoff = abeg - splice->vcf.pos;
1644 #if XDBG
1645 fprintf(stderr,"a1: %s aoff=%d\n",splice->kalt.s,aoff);
1646 #endif
1647
1648 if ( aoff < splice->vcf.alen && splice->kalt.l < alen )
1649 {
1650 int len = splice->vcf.alen - aoff; // len still available in vcf.alt
1651 if ( len > alen - splice->kalt.l ) len = alen - splice->kalt.l; // how much of alt allele is still needed
1652 kputsn(splice->vcf.alt + aoff, len, &splice->kalt);
1653 aoff -= len;
1654 }
1655 if ( aoff < 0 ) aoff = 0;
1656 else aoff--;
1657 #if XDBG
1658 fprintf(stderr,"a2: %s aoff=%d\n",splice->kalt.s,aoff);
1659 #endif
1660
1661 end = splice->vcf.pos + splice->vcf.rlen; // position just after the ref allele
1662 if ( splice->kalt.l < alen )
1663 {
1664 if ( end + alen + aoff - splice->kalt.l - 1 > splice->tr->end ) // trim, the requested sequence is too long
1665 alen -= end + alen + aoff - splice->kalt.l - 1 - splice->tr->end;
1666 if ( alen > 0 && alen > splice->kalt.l )
1667 kputsn(splice->tr->ref + aoff + N_REF_PAD + end - splice->tr->beg, alen - splice->kalt.l, &splice->kalt);
1668 }
1669 #if XDBG
1670 fprintf(stderr,"a3: %s\n",splice->kalt.s);
1671 fprintf(stderr," [%s]\n [%s]\n\n",splice->kref.s,splice->kalt.s);
1672 #endif
1673 }
1674 void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec);
csq_stage_utr(args_t * args,regitr_t * itr,bcf1_t * rec,uint32_t trid,uint32_t type,int ial)1675 static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32_t trid, uint32_t type, int ial)
1676 {
1677 while ( regitr_overlap(itr) )
1678 {
1679 gf_utr_t *utr = regitr_payload(itr, gf_utr_t*);
1680 tscript_t *tr = utr->tr;
1681 if ( tr->id != trid ) continue;
1682 csq_t csq;
1683 memset(&csq, 0, sizeof(csq_t));
1684 csq.pos = rec->pos;
1685 csq.type.type = (utr->which==prime5 ? CSQ_UTR5 : CSQ_UTR3) | type;
1686 csq.type.biotype = tr->type;
1687 csq.type.strand = tr->strand;
1688 csq.type.trid = tr->id;
1689 csq.type.vcf_ial = ial;
1690 csq.type.gene = tr->gene->name;
1691 csq_stage(args, &csq, rec);
1692 return csq.type.type;
1693 }
1694 return 0;
1695 }
csq_stage_splice(args_t * args,bcf1_t * rec,tscript_t * tr,uint32_t type,int ial)1696 static inline void csq_stage_splice(args_t *args, bcf1_t *rec, tscript_t *tr, uint32_t type, int ial)
1697 {
1698 #if XDBG
1699 fprintf(stderr,"csq_stage_splice %d: type=%d\n",rec->pos+1,type);
1700 #endif
1701 if ( !type ) return;
1702 csq_t csq;
1703 memset(&csq, 0, sizeof(csq_t));
1704 csq.pos = rec->pos;
1705 csq.type.type = type;
1706 csq.type.biotype = tr->type;
1707 csq.type.strand = tr->strand;
1708 csq.type.trid = tr->id;
1709 csq.type.vcf_ial = ial;
1710 csq.type.gene = tr->gene->name;
1711 csq_stage(args, &csq, rec);
1712 }
splice_csq_ins(args_t * args,splice_t * splice,uint32_t ex_beg,uint32_t ex_end)1713 static inline int splice_csq_ins(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
1714 {
1715 // coordinates that matter for consequences, eg AC>ACG trimmed to C>CG, 1bp
1716 // before and after the inserted bases
1717 if ( splice->tbeg || splice->vcf.ref[0]!=splice->vcf.alt[0] )
1718 {
1719 splice->ref_beg = splice->vcf.pos + splice->tbeg - 1;
1720 splice->ref_end = splice->vcf.pos + splice->vcf.rlen - splice->tend;
1721 }
1722 else
1723 {
1724 if ( splice->tend ) splice->tend--;
1725 splice->ref_beg = splice->vcf.pos;
1726 splice->ref_end = splice->vcf.pos + splice->vcf.rlen - splice->tend;
1727 }
1728 #if XDBG
1729 fprintf(stderr,"ins: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_utr=%d start,stop,beg,end=%d,%d,%d,%d\n", splice->vcf.ref,splice->vcf.alt,ex_beg,ex_end,splice->ref_beg,splice->ref_end,splice->tbeg,splice->tend,splice->check_utr,splice->check_start,splice->check_stop,splice->check_region_beg,splice->check_region_end);
1730 #endif
1731
1732 int ret;
1733 if ( splice->ref_beg >= ex_end ) // fully outside, beyond the exon
1734 {
1735 if ( splice->check_utr )
1736 {
1737 regitr_t *itr = regitr_init(NULL);
1738 const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
1739 if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg+1,splice->ref_beg+1, itr) ) // adjacent utr
1740 {
1741 ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
1742 if ( ret!=0 )
1743 {
1744 regitr_destroy(itr);
1745 return SPLICE_OUTSIDE; // overlaps utr
1746 }
1747 }
1748 regitr_destroy(itr);
1749 }
1750 if ( !splice->check_region_end ) return SPLICE_OUTSIDE;
1751 char *ref = NULL, *alt = NULL;
1752 if ( splice->set_refalt ) // seq identity is checked only when tr->ref is available
1753 {
1754 splice_build_hap(splice, ex_end+1, N_SPLICE_REGION_INTRON);
1755 ref = splice->kref.s, alt = splice->kalt.s;
1756 }
1757 if ( splice->ref_beg < ex_end + N_SPLICE_REGION_INTRON && splice->ref_end > ex_end + N_SPLICE_DONOR )
1758 {
1759 splice->csq |= CSQ_SPLICE_REGION;
1760 if ( ref && !strncmp(ref,alt,N_SPLICE_REGION_INTRON) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
1761 }
1762 if ( splice->ref_beg < ex_end + N_SPLICE_DONOR )
1763 {
1764 if ( splice->check_donor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_DONOR;
1765 if ( splice->check_acceptor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
1766 if ( ref && !strncmp(ref,alt,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
1767 }
1768 csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial);
1769 return SPLICE_OUTSIDE;
1770 }
1771 if ( splice->ref_end < ex_beg || (splice->ref_end == ex_beg && !splice->check_region_beg) ) // fully outside, before the exon
1772 {
1773 if ( splice->check_utr )
1774 {
1775 regitr_t *itr = regitr_init(NULL);
1776 const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
1777 if ( regidx_overlap(args->idx_utr,chr,splice->ref_end-1,splice->ref_end-1, itr) ) // adjacent utr
1778 {
1779 ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
1780 if ( ret!=0 )
1781 {
1782 regitr_destroy(itr);
1783 return SPLICE_OUTSIDE; // overlaps utr
1784 }
1785 }
1786 regitr_destroy(itr);
1787 }
1788 if ( !splice->check_region_beg ) return SPLICE_OUTSIDE;
1789 char *ref = NULL, *alt = NULL;
1790 if ( splice->set_refalt ) // seq identity is checked only when tr->ref is available
1791 {
1792 splice_build_hap(splice, ex_beg - N_SPLICE_REGION_INTRON, N_SPLICE_REGION_INTRON);
1793 ref = splice->kref.s, alt = splice->kalt.s;
1794 }
1795 if ( splice->ref_end > ex_beg - N_SPLICE_REGION_INTRON && splice->ref_beg < ex_beg - N_SPLICE_DONOR )
1796 {
1797 splice->csq |= CSQ_SPLICE_REGION;
1798 if ( ref && !strncmp(ref,alt,N_SPLICE_REGION_INTRON) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
1799 }
1800 if ( splice->ref_end > ex_beg - N_SPLICE_DONOR )
1801 {
1802 if ( splice->check_donor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_DONOR;
1803 if ( splice->check_acceptor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
1804 if ( ref && !strncmp(ref+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,alt+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
1805 }
1806 csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial);
1807 return SPLICE_OUTSIDE;
1808 }
1809 // overlaps the exon or inside the exon
1810 // possible todo: find better alignment for frameshifting variants?
1811 if ( splice->ref_beg <= ex_beg + 2 ) // in the first 3bp
1812 {
1813 if ( splice->check_region_beg ) splice->csq |= CSQ_SPLICE_REGION;
1814 if ( splice->tr->strand==STRAND_FWD ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
1815 else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
1816 }
1817 if ( splice->ref_end > ex_end - 2 )
1818 {
1819 if ( splice->check_region_end ) splice->csq |= CSQ_SPLICE_REGION;
1820 if ( splice->tr->strand==STRAND_REV ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
1821 else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
1822 }
1823 if ( splice->set_refalt )
1824 {
1825 // Make sure the variant will not end up left aligned to avoid overlapping vcf records
1826 // splice_build_hap(splice, splice->ref_beg, splice->vcf.alen - splice->tend - splice->tbeg + 1);
1827 // splice->vcf.rlen -= splice->tbeg + splice->tend - 1;
1828 // if ( splice->kref.l > splice->vcf.rlen ) { splice->kref.l = splice->vcf.rlen; splice->kref.s[splice->kref.l] = 0; }
1829 if ( splice->ref_beg < splice->vcf.pos ) // this must have been caused by too much trimming from right
1830 {
1831 int dlen = splice->vcf.pos - splice->ref_beg;
1832 assert( dlen==1 );
1833 splice->tbeg += dlen;
1834 if ( splice->tbeg + splice->tend == splice->vcf.rlen ) splice->tend -= dlen;
1835 splice->ref_beg = splice->vcf.pos;
1836 }
1837 if ( splice->ref_end==ex_beg ) splice->tend--; // prevent zero-length ref allele
1838 splice_build_hap(splice, splice->ref_beg, splice->vcf.alen - splice->tend - splice->tbeg + 1);
1839 splice->vcf.rlen -= splice->tbeg + splice->tend - 1;
1840 if ( splice->kref.l > splice->vcf.rlen ) { splice->kref.l = splice->vcf.rlen; splice->kref.s[splice->kref.l] = 0; }
1841 }
1842 csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial);
1843 return SPLICE_INSIDE;
1844 }
1845
shifted_del_synonymous(args_t * args,splice_t * splice,uint32_t ex_beg,uint32_t ex_end)1846 int shifted_del_synonymous(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
1847 {
1848 static int small_ref_padding_warned = 0;
1849 tscript_t *tr = splice->tr;
1850
1851 // We know the VCF record overlaps the exon, but does it overlap the start codon?
1852 if ( tr->strand==STRAND_REV && splice->vcf.pos + splice->vcf.rlen + 2 <= ex_end ) return 0;
1853 if ( tr->strand==STRAND_FWD && splice->vcf.pos >= ex_beg + 3 ) return 0;
1854
1855 #if XDBG
1856 fprintf(stderr,"shifted_del_synonymous: %d-%d %s\n",ex_beg,ex_end, tr->strand==STRAND_FWD?"fwd":"rev");
1857 fprintf(stderr," %d .. %s > %s\n",splice->vcf.pos+1,splice->vcf.ref,splice->vcf.alt);
1858 #endif
1859
1860 // is there enough ref sequence for the extension? All coordinates are 0-based
1861 int ref_len = strlen(splice->vcf.ref);
1862 int alt_len = strlen(splice->vcf.alt);
1863 assert( ref_len > alt_len );
1864 int ndel = ref_len - alt_len;
1865
1866 if ( tr->strand==STRAND_REV )
1867 {
1868 int32_t vcf_ref_end = splice->vcf.pos + ref_len - 1; // end pos of the VCF REF allele
1869 int32_t tr_ref_end = splice->tr->end + N_REF_PAD; // the end pos of accessible cached ref seq
1870 if ( vcf_ref_end + ndel > tr_ref_end )
1871 {
1872 if ( !small_ref_padding_warned )
1873 {
1874 fprintf(stderr,"Warning: Could not verify synonymous start/stop at %s:%d due to small N_REF_PAD. (Improve me?)\n",bcf_seqname(args->hdr,splice->vcf.rec),splice->vcf.pos+1);
1875 small_ref_padding_warned = 1;
1876 }
1877 return 0;
1878 }
1879
1880 char *ptr_vcf = splice->vcf.ref + alt_len; // the first deleted base in the VCF REF allele
1881 char *ptr_ref = splice->tr->ref + N_REF_PAD + (vcf_ref_end + 1 - splice->tr->beg); // the first ref base after the ndel bases deleted
1882 #if XDBG
1883 fprintf(stderr,"vcf: %s\nref: %s\n",ptr_vcf,ptr_ref);
1884 #endif
1885 int i = 0;
1886 while ( ptr_vcf[i] && ptr_vcf[i]==ptr_ref[i] ) i++;
1887 if ( ptr_vcf[i] ) return 0; // the deleted sequence cannot be replaced
1888 }
1889 else
1890 {
1891 // STRAND_FWD
1892 int32_t vcf_block_beg = splice->vcf.pos + ref_len - 2*ndel; // the position of the first base of the ref block that could potentially replace the deletion
1893 if ( vcf_block_beg < 0 ) return 0;
1894
1895 #if XDBG
1896 fprintf(stderr,"vcf_block_beg: %d\n",vcf_block_beg+1);
1897 #endif
1898
1899 if ( N_REF_PAD + vcf_block_beg < ex_beg )
1900 {
1901 if ( !small_ref_padding_warned )
1902 {
1903 fprintf(stderr,"Warning: Could not verify synonymous start/stop at %s:%d due to small N_REF_PAD. (Improve me?)\n",bcf_seqname(args->hdr,splice->vcf.rec),splice->vcf.pos+1);
1904 small_ref_padding_warned = 1;
1905 }
1906 return 0;
1907 }
1908
1909 char *ptr_vcf = splice->vcf.ref + alt_len; // the first deleted base in the VCF REF allele
1910 char *ptr_ref = splice->tr->ref + N_REF_PAD + vcf_block_beg - splice->tr->beg; // the replacement ref block
1911 #if XDBG
1912 fprintf(stderr,"vcf: %s\nref: %s\n",ptr_vcf,ptr_ref);
1913 #endif
1914
1915 int i = 0;
1916 while ( ptr_vcf[i] && ptr_vcf[i]==ptr_ref[i] ) i++;
1917 if ( ptr_vcf[i] ) return 0; // the deleted sequence cannot be replaced
1918 }
1919
1920 return 1;
1921 }
1922
splice_csq_del(args_t * args,splice_t * splice,uint32_t ex_beg,uint32_t ex_end)1923 static inline int splice_csq_del(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
1924 {
1925 if ( splice->check_start )
1926 {
1927 // check for synonymous start
1928 // test/csq/ENST00000375992/incorrect-synon-del-not-start-lost.txt
1929 // test/csq/ENST00000368801.2/start-lost.txt
1930 // test/csq/ENST00000318249.2/synonymous-start-lost.txt
1931 int is_synonymous = shifted_del_synonymous(args, splice, ex_beg, ex_end);
1932 if ( is_synonymous )
1933 {
1934 splice->csq |= CSQ_START_RETAINED;
1935 return SPLICE_OVERLAP;
1936 }
1937 }
1938
1939 // coordinates that matter for consequences, eg AC>ACG trimmed to C>CG
1940 splice->ref_beg = splice->vcf.pos + splice->tbeg - 1; // 1b before the deleted base
1941 splice->ref_end = splice->vcf.pos + splice->vcf.rlen - splice->tend - 1; // the last deleted base
1942
1943 #if XDBG
1944 fprintf(stderr,"splice_csq_del: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_utr=%d start,stop,beg,end=%d,%d,%d,%d\n", splice->vcf.ref,splice->vcf.alt,ex_beg,ex_end,splice->ref_beg,splice->ref_end,splice->tbeg,splice->tend,splice->check_utr,splice->check_start,splice->check_stop,splice->check_region_beg,splice->check_region_end);
1945 #endif
1946
1947 if ( splice->ref_beg + 1 < ex_beg ) // the part before the exon; ref_beg is off by -1
1948 {
1949 if ( splice->check_region_beg )
1950 {
1951 int csq = 0;
1952 if ( splice->check_utr )
1953 {
1954 regitr_t *itr = regitr_init(NULL);
1955 const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
1956 if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr
1957 csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
1958 regitr_destroy(itr);
1959 }
1960 if ( !csq )
1961 {
1962 char *ref = NULL, *alt = NULL;
1963 if ( splice->set_refalt ) // seq identity is checked only when tr->ref is available
1964 {
1965 // filling from the left does not work for ENST00000341065/frame3.vcf
1966 // CAG.GTGGCCAG CAG.GTGGCCAG
1967 // CA-.--GGCCAG vs CAG.---GCCAG
1968 // splice_build_hap(splice, ex_beg-1, -N_SPLICE_REGION_INTRON);
1969 //
1970 // filling from the right:
1971 splice_build_hap(splice, ex_beg - N_SPLICE_REGION_INTRON, N_SPLICE_REGION_INTRON);
1972 ref = splice->kref.s, alt = splice->kalt.s;
1973 }
1974 if ( splice->ref_end >= ex_beg - N_SPLICE_REGION_INTRON && splice->ref_beg < ex_beg - N_SPLICE_DONOR )
1975 {
1976 splice->csq |= CSQ_SPLICE_REGION;
1977 if ( ref && alt && !strncmp(ref,alt,N_SPLICE_REGION_INTRON) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
1978 }
1979 if ( splice->ref_end >= ex_beg - N_SPLICE_DONOR )
1980 {
1981 if ( splice->check_donor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_DONOR;
1982 if ( splice->check_acceptor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
1983 if ( ref && alt && !strncmp(ref+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,alt+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
1984 }
1985 }
1986 }
1987 if ( splice->ref_end >= ex_beg )
1988 {
1989 splice->tbeg = splice->ref_beg - splice->vcf.pos + 1;
1990 splice->ref_beg = ex_beg - 1;
1991 if ( splice->tbeg + splice->tend == splice->vcf.alen )
1992 {
1993 // the deletion overlaps ex_beg and cannot be easily realigned to the right
1994 if ( !splice->tend )
1995 {
1996 splice->csq |= CSQ_CODING_SEQUENCE;
1997 return SPLICE_OVERLAP;
1998 }
1999 splice->tend--;
2000 }
2001 }
2002 }
2003 if ( ex_end < splice->ref_end ) // the part after the exon
2004 {
2005 if ( splice->check_region_end )
2006 {
2007 int csq = 0;
2008 if ( splice->check_utr )
2009 {
2010 regitr_t *itr = regitr_init(NULL);
2011 const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
2012 if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr
2013 csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
2014 regitr_destroy(itr);
2015 }
2016 if ( !csq )
2017 {
2018 char *ref = NULL, *alt = NULL;
2019 if ( splice->set_refalt ) // seq identity is checked only when tr->ref is available
2020 {
2021 splice_build_hap(splice, ex_end+1, N_SPLICE_REGION_INTRON); // ref,alt positioned at the first intron base
2022 ref = splice->kref.s, alt = splice->kalt.s;
2023 }
2024 if ( splice->ref_beg < ex_end + N_SPLICE_REGION_INTRON && splice->ref_end > ex_end + N_SPLICE_DONOR )
2025 {
2026 splice->csq |= CSQ_SPLICE_REGION;
2027 if ( ref && alt && !strncmp(ref,alt,N_SPLICE_REGION_INTRON) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
2028 }
2029 if ( splice->ref_beg < ex_end + N_SPLICE_DONOR )
2030 {
2031 if ( splice->check_donor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_DONOR;
2032 if ( splice->check_acceptor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
2033 if ( ref && alt && !strncmp(ref+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,alt+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
2034 }
2035 }
2036 }
2037 if ( splice->ref_beg < ex_end )
2038 {
2039 splice->tend = splice->vcf.rlen - (splice->ref_end - splice->vcf.pos + 1);
2040 splice->ref_end = ex_end;
2041 }
2042 }
2043 if ( splice->ref_end < ex_beg || splice->ref_beg >= ex_end )
2044 {
2045 csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial);
2046 return SPLICE_OUTSIDE;
2047 }
2048 if ( splice->ref_beg < ex_beg + 2 ) // ref_beg is off by -1
2049 {
2050 if ( splice->check_region_beg ) splice->csq |= CSQ_SPLICE_REGION;
2051 if ( splice->tr->strand==STRAND_FWD ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
2052 else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
2053 }
2054 if ( splice->ref_end > ex_end - 3 )
2055 {
2056 if ( splice->check_region_end ) splice->csq |= CSQ_SPLICE_REGION;
2057 if ( splice->tr->strand==STRAND_REV ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
2058 else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
2059 }
2060 if ( splice->set_refalt )
2061 {
2062 if ( splice->tbeg>0 ) splice->tbeg--; //why is this?
2063 if ( splice->vcf.rlen > splice->tbeg + splice->tend && splice->vcf.alen > splice->tbeg + splice->tend )
2064 {
2065 splice->vcf.rlen -= splice->tbeg + splice->tend;
2066 splice->vcf.alen -= splice->tbeg + splice->tend;
2067 }
2068 splice->kref.l = 0; kputsn(splice->vcf.ref + splice->tbeg, splice->vcf.rlen, &splice->kref);
2069 splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.alen, &splice->kalt);
2070 if ( (splice->ref_beg+1 < ex_beg && splice->ref_end >= ex_beg) || (splice->ref_beg+1 < ex_end && splice->ref_end >= ex_end) ) // ouch, ugly ENST00000409523/long-overlapping-del.vcf
2071 {
2072 splice->csq |= (splice->ref_end - splice->ref_beg)%3 ? CSQ_FRAMESHIFT_VARIANT : CSQ_INFRAME_DELETION;
2073 return SPLICE_OVERLAP;
2074 }
2075 }
2076 csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial);
2077 return SPLICE_INSIDE;
2078 }
2079
splice_csq_mnp(args_t * args,splice_t * splice,uint32_t ex_beg,uint32_t ex_end)2080 static inline int splice_csq_mnp(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
2081 {
2082 // not a real variant, can be ignored: eg ACGT>ACGT
2083 if ( splice->tbeg + splice->tend == splice->vcf.rlen ) return SPLICE_VAR_REF;
2084
2085 splice->ref_beg = splice->vcf.pos + splice->tbeg;
2086 splice->ref_end = splice->vcf.pos + splice->vcf.rlen - splice->tend - 1;
2087
2088 #if XDBG
2089 fprintf(stderr,"mnp: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_utr=%d start,stop,beg,end=%d,%d,%d,%d\n", splice->vcf.ref,splice->vcf.alt,ex_beg,ex_end,splice->ref_beg,splice->ref_end,splice->tbeg,splice->tend,splice->check_utr,splice->check_start,splice->check_stop,splice->check_region_beg,splice->check_region_end);
2090 #endif
2091
2092 if ( splice->ref_beg < ex_beg ) // the part before the exon
2093 {
2094 if ( splice->check_region_beg )
2095 {
2096 int csq = 0;
2097 if ( splice->check_utr )
2098 {
2099 regitr_t *itr = regitr_init(NULL);
2100 const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
2101 if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr
2102 csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
2103 regitr_destroy(itr);
2104 }
2105 if ( !csq )
2106 {
2107 if ( splice->ref_end >= ex_beg - N_SPLICE_REGION_INTRON && splice->ref_beg < ex_beg - N_SPLICE_DONOR )
2108 splice->csq |= CSQ_SPLICE_REGION;
2109 if ( splice->ref_end >= ex_beg - N_SPLICE_DONOR )
2110 {
2111 if ( splice->check_donor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_DONOR;
2112 if ( splice->check_acceptor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
2113 }
2114 }
2115 }
2116 if ( splice->ref_end >= ex_beg )
2117 {
2118 splice->tbeg = splice->ref_beg - splice->vcf.pos;
2119 splice->ref_beg = ex_beg;
2120 }
2121 }
2122 if ( ex_end < splice->ref_end ) // the part after the exon
2123 {
2124 if ( splice->check_region_end )
2125 {
2126 int csq = 0;
2127 if ( splice->check_utr )
2128 {
2129 regitr_t *itr = regitr_init(NULL);
2130 const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
2131 if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr
2132 csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
2133 regitr_destroy(itr);
2134 }
2135 if ( !csq )
2136 {
2137 if ( splice->ref_beg <= ex_end + N_SPLICE_REGION_INTRON && splice->ref_end > ex_end + N_SPLICE_DONOR )
2138 splice->csq |= CSQ_SPLICE_REGION;
2139 if ( splice->ref_beg <= ex_end + N_SPLICE_DONOR )
2140 {
2141 if ( splice->check_donor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_DONOR;
2142 if ( splice->check_acceptor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
2143 }
2144 }
2145 }
2146 if ( splice->ref_beg <= ex_end )
2147 {
2148 splice->tend = splice->vcf.rlen - (splice->ref_end - splice->vcf.pos + 1);
2149 splice->ref_end = ex_end;
2150 }
2151 }
2152 if ( splice->ref_end < ex_beg || splice->ref_beg > ex_end )
2153 {
2154 csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial);
2155 return SPLICE_OUTSIDE;
2156 }
2157
2158 if ( splice->ref_beg < ex_beg + 3 )
2159 {
2160 if ( splice->check_region_beg ) splice->csq |= CSQ_SPLICE_REGION;
2161 if ( splice->tr->strand==STRAND_FWD ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
2162 else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
2163 }
2164 if ( splice->ref_end > ex_end - 3 )
2165 {
2166 if ( splice->check_region_end ) splice->csq |= CSQ_SPLICE_REGION;
2167 if ( splice->tr->strand==STRAND_REV ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
2168 else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
2169 }
2170 if ( splice->set_refalt )
2171 {
2172 splice->vcf.rlen -= splice->tbeg + splice->tend;
2173 splice->kref.l = 0; kputsn(splice->vcf.ref + splice->tbeg, splice->vcf.rlen, &splice->kref);
2174 splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.rlen, &splice->kalt);
2175 }
2176 csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial);
2177 return SPLICE_INSIDE;
2178 }
splice_csq(args_t * args,splice_t * splice,uint32_t ex_beg,uint32_t ex_end)2179 static inline int splice_csq(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
2180 {
2181 splice->vcf.alen = strlen(splice->vcf.alt);
2182
2183 int rlen1 = splice->vcf.rlen - 1, alen1 = splice->vcf.alen - 1, i = 0;
2184 splice->tbeg = 0, splice->tend = 0;
2185
2186 // trim from the right, then from the left
2187 while ( i<=rlen1 && i<=alen1 )
2188 {
2189 if ( splice->vcf.ref[rlen1-i] != splice->vcf.alt[alen1-i] ) break;
2190 i++;
2191 }
2192 splice->tend = i;
2193 rlen1 -= i, alen1 -= i, i = 0;
2194 while ( i<=rlen1 && i<=alen1 )
2195 {
2196 if ( splice->vcf.ref[i] != splice->vcf.alt[i] ) break;
2197 i++;
2198 }
2199 splice->tbeg = i;
2200
2201 // The mnp, ins and del code was split into near-identical functions for clarity and debugging;
2202 // possible todo: generalize once stable
2203 if ( splice->vcf.rlen==splice->vcf.alen ) return splice_csq_mnp(args, splice, ex_beg, ex_end);
2204 if ( splice->vcf.rlen < splice->vcf.alen ) return splice_csq_ins(args, splice, ex_beg, ex_end);
2205 if ( splice->vcf.rlen > splice->vcf.alen ) return splice_csq_del(args, splice, ex_beg, ex_end);
2206
2207 return 0;
2208 }
2209
2210
2211 // return value: 0 added, 1 overlapping variant, 2 silent discard (intronic,alt=ref)
hap_init(args_t * args,hap_node_t * parent,hap_node_t * child,gf_cds_t * cds,bcf1_t * rec,int ial)2212 int hap_init(args_t *args, hap_node_t *parent, hap_node_t *child, gf_cds_t *cds, bcf1_t *rec, int ial)
2213 {
2214 int i;
2215 kstring_t str = {0,0,0};
2216 tscript_t *tr = cds->tr;
2217 child->icds = cds->icds; // index of cds in the tscript's list of exons
2218 child->vcf_ial = ial;
2219
2220 splice_t splice;
2221 splice_init(&splice, rec);
2222 splice.tr = tr;
2223 splice.vcf.ial = ial;
2224 splice.vcf.alt = rec->d.allele[ial];
2225 splice.check_acceptor = splice.check_donor = splice.set_refalt = splice.check_utr = 1;
2226 if ( !(tr->trim & TRIM_5PRIME) )
2227 {
2228 if ( tr->strand==STRAND_FWD ) { if ( child->icds==0 ) splice.check_start = 1; }
2229 else { if ( child->icds==tr->ncds-1 ) splice.check_start = 1; }
2230 }
2231 if ( !(tr->trim & TRIM_3PRIME) )
2232 {
2233 if ( tr->strand==STRAND_FWD ) { if ( child->icds==tr->ncds-1 ) splice.check_stop = 1; }
2234 else { if ( child->icds==0 ) splice.check_stop = 1; }
2235 }
2236 if ( splice.check_start ) // do not check starts in incomplete CDS, defined as not starting with M
2237 {
2238 if ( tr->strand==STRAND_FWD ) { if ( dna2aa(tr->ref+N_REF_PAD+cds->beg-tr->beg) != 'M' ) splice.check_start = 0; }
2239 else { if ( cdna2aa(tr->ref+N_REF_PAD+cds->beg-tr->beg+cds->len-3) != 'M' ) splice.check_start = 0; }
2240 }
2241 if ( child->icds!=0 ) splice.check_region_beg = 1;
2242 if ( child->icds!=tr->ncds-1 ) splice.check_region_end = 1;
2243
2244 #if XDBG
2245 fprintf(stderr,"\nhap_init: %d [%s][%s] check start:%d,stop:%d\n",splice.vcf.pos+1,splice.vcf.ref,splice.vcf.alt,splice.check_start,splice.check_stop);
2246 #endif
2247 int ret = splice_csq(args, &splice, cds->beg, cds->beg + cds->len - 1);
2248 #if XDBG
2249 fprintf(stderr,"cds splice_csq: %d [%s][%s] .. beg,end=%d %d, ret=%d, csq=%d\n\n",splice.vcf.pos+1,splice.kref.s,splice.kalt.s,splice.ref_beg+1,splice.ref_end+1,ret,splice.csq);
2250 #endif
2251
2252 if ( ret==SPLICE_VAR_REF ) return 2; // not a variant, eg REF=CA ALT=CA
2253 if ( ret==SPLICE_OUTSIDE || ret==SPLICE_OVERLAP || splice.csq==CSQ_START_LOST ) // not a coding csq
2254 {
2255 free(splice.kref.s);
2256 free(splice.kalt.s);
2257
2258 if ( !splice.csq ) return 2; // fully intronic, no csq
2259
2260 // splice_region/acceptor/donor
2261 child->seq = NULL;
2262 child->sbeg = 0;
2263 child->rbeg = rec->pos;
2264 child->rlen = 0;
2265 child->dlen = 0;
2266 kputs(rec->d.allele[0],&str);
2267 kputc('>',&str);
2268 kputs(rec->d.allele[ial],&str);
2269 child->var = str.s;
2270 child->type = HAP_SSS;
2271 child->csq = splice.csq;
2272 child->rec = rec;
2273 return 0;
2274 }
2275 if ( splice.csq & CSQ_SYNONYMOUS_VARIANT ) splice.csq &= ~CSQ_SYNONYMOUS_VARIANT; // synonymous&splice,frame could become synonymous&frame,splice
2276
2277 int dbeg = 0;
2278 if ( splice.ref_beg < cds->beg )
2279 {
2280 // The vcf record overlaps the exon boundary, but the variant itself
2281 // should fit inside since we are here. This will need more work.
2282 // #1475227917
2283 dbeg = cds->beg - splice.ref_beg;
2284 splice.kref.l -= dbeg;
2285 splice.ref_beg = cds->beg;
2286 assert( dbeg <= splice.kalt.l );
2287 }
2288
2289 assert( parent->type!=HAP_SSS );
2290 if ( parent->type==HAP_CDS )
2291 {
2292 i = parent->icds;
2293 if ( i!=cds->icds )
2294 {
2295 // the variant is on a new exon, finish up the previous
2296 int len = tr->cds[i]->len - parent->rbeg - parent->rlen + tr->cds[i]->beg;
2297 if ( len > 0 )
2298 kputsn_(tr->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str);
2299 }
2300
2301 // append any skipped non-variant exons
2302 while ( ++i < cds->icds )
2303 kputsn_(tr->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len, &str);
2304
2305 if ( parent->icds==child->icds )
2306 {
2307 int len = splice.ref_beg - parent->rbeg - parent->rlen;
2308 if ( len < 0 ) // overlapping variants
2309 {
2310 free(str.s);
2311 free(splice.kref.s);
2312 free(splice.kalt.s);
2313 return 1;
2314 }
2315 kputsn_(tr->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str);
2316 }
2317 else
2318 kputsn_(tr->ref + N_REF_PAD + cds->beg - tr->beg, splice.ref_beg - cds->beg, &str);
2319 }
2320 kputs(splice.kalt.s + dbeg, &str);
2321
2322 child->seq = str.s;
2323 child->sbeg = cds->pos + (splice.ref_beg - cds->beg);
2324 child->rbeg = splice.ref_beg;
2325 child->rlen = splice.kref.l;
2326 child->type = HAP_CDS;
2327 child->prev = parent;
2328 child->rec = rec;
2329 child->csq = splice.csq;
2330
2331 // set vlen and the "ref>alt" string
2332 {
2333 int rlen = strlen(rec->d.allele[0]);
2334 int alen = strlen(rec->d.allele[ial]);
2335 child->dlen = alen - rlen;
2336 child->var = (char*) malloc(rlen+alen+2);
2337 memcpy(child->var,rec->d.allele[0],rlen);
2338 child->var[rlen] = '>';
2339 memcpy(child->var+rlen+1,rec->d.allele[ial],alen);
2340 child->var[rlen+alen+1] = 0;
2341 }
2342
2343 // yuck, the whole CDS is modified/deleted, not ready for this, todo.
2344 if ( child->rbeg + child->rlen > cds->beg + cds->len )
2345 {
2346 child->type = HAP_SSS;
2347 if ( !child->csq ) child->csq |= CSQ_CODING_SEQUENCE; // hack, specifically for ENST00000390520/deletion-overlap.vcf
2348 }
2349
2350
2351 free(splice.kref.s);
2352 free(splice.kalt.s);
2353 return 0;
2354 }
hap_destroy(hap_node_t * hap)2355 void hap_destroy(hap_node_t *hap)
2356 {
2357 int i;
2358 for (i=0; i<hap->nchild; i++)
2359 if ( hap->child[i] ) hap_destroy(hap->child[i]);
2360 for (i=0; i<hap->mcsq_list; i++) free(hap->csq_list[i].type.vstr.s);
2361 free(hap->csq_list);
2362 free(hap->child);
2363 free(hap->cur_child);
2364 free(hap->seq);
2365 free(hap->var);
2366 free(hap);
2367 }
2368
2369
2370 /*
2371 ref: spliced reference and its length (ref.l)
2372 seq: part of the spliced query transcript on the reference strand to translate, its
2373 length (seq.l) and the total length of the complete transcript (seq.m)
2374 sbeg: seq offset within the spliced query transcript
2375 rbeg: seq offset within ref, 0-based
2376 rend: last base of seq within ref, plus one. If seq does not contain indels, it is rend=rbeg+seq->l
2377 strand: coding strand - 0:rev, 1:fwd
2378 tseq: translated sequence (aa)
2379 fill: frameshift, fill until the end (strand=fwd) or from the start (strand=rev)
2380 */
cds_translate(kstring_t * _ref,kstring_t * _seq,uint32_t sbeg,uint32_t rbeg,uint32_t rend,int strand,kstring_t * tseq,int fill)2381 void cds_translate(kstring_t *_ref, kstring_t *_seq, uint32_t sbeg, uint32_t rbeg, uint32_t rend, int strand, kstring_t *tseq, int fill)
2382 {
2383 #if XDBG
2384 fprintf(stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill,(int)_seq->l);
2385 #endif
2386 char tmp[3], *codon, *end;
2387 int i, len, npad;
2388
2389 kstring_t ref = *_ref;
2390 kstring_t seq = *_seq;
2391
2392 tseq->l = 0;
2393 if ( !seq.l )
2394 {
2395 kputc('?', tseq);
2396 return;
2397 }
2398
2399 #define DBG 0
2400 #if DBG
2401 fprintf(stderr,"translate: sbeg,rbeg,rend=%d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill,(int)_seq->l);
2402 fprintf(stderr," ref: l=%d %s\n", (int)ref.l,ref.s);
2403 fprintf(stderr," seq: l=%d m=%d ", (int)seq.l,(int)seq.m);
2404 for (i=0; i<seq.l; i++) fprintf(stderr,"%c",seq.s[i]); fprintf(stderr,"\n");
2405 fprintf(stderr," sbeg,rbeg,rend: %d,%d,%d\n", sbeg,rbeg,rend);
2406 fprintf(stderr," strand,fill: %d,%d\n", strand,fill);
2407 #endif
2408
2409 if ( strand==STRAND_FWD )
2410 {
2411 // left padding
2412 npad = sbeg % 3;
2413 #if DBG>1
2414 fprintf(stderr," npad: %d\n",npad);
2415 #endif
2416 assert( npad<=rbeg );
2417
2418 for (i=0; i<npad; i++)
2419 tmp[i] = ref.s[rbeg+i-npad+N_REF_PAD];
2420 for (; i<3 && i-npad<seq.l; i++)
2421 tmp[i] = seq.s[i-npad];
2422 len = seq.l - i + npad; // the remaining length of padded sseq
2423 #if DBG>1
2424 fprintf(stderr,"\t i=%d\n", i);
2425 #endif
2426 if ( i==3 )
2427 {
2428 kputc_(dna2aa(tmp), tseq);
2429 #if DBG>1
2430 fprintf(stderr,"[1]%c%c%c\n",tmp[0],tmp[1],tmp[2]);
2431 #endif
2432 codon = seq.s + 3 - npad; // next codon
2433 end = codon + len - 1 - (len % 3); // last position of a valid codon
2434 while ( codon < end )
2435 {
2436 kputc_(dna2aa(codon), tseq);
2437 #if DBG>1
2438 fprintf(stderr,"[2]%c%c%c\n",codon[0],codon[1],codon[2]);
2439 #endif
2440 codon += 3;
2441 }
2442 end = seq.s + seq.l - 1;
2443 for (i=0; codon+i<=end; i++) tmp[i] = codon[i];
2444 }
2445
2446 // right padding
2447 codon = ref.s + rend + N_REF_PAD;
2448 if ( i>0 )
2449 {
2450 #if DBG>1
2451 if(i==1)fprintf(stderr,"[3]%c\n",tmp[0]);
2452 if(i==2)fprintf(stderr,"[3]%c%c\n",tmp[0],tmp[1]);
2453 #endif
2454 for (; i<3; i++)
2455 {
2456 tmp[i] = *codon;
2457 codon++;
2458 }
2459 kputc_(dna2aa(tmp), tseq);
2460 #if DBG>1
2461 fprintf(stderr,"[4]%c%c%c\n",tmp[0],tmp[1],tmp[2]);
2462 #endif
2463 }
2464 if ( fill!=0 )
2465 {
2466 end = ref.s + ref.l - N_REF_PAD;
2467 while ( codon+3 <= end )
2468 {
2469 kputc_(dna2aa(codon), tseq);
2470 #if DBG>1
2471 fprintf(stderr,"[5]%c%c%c\t%c\n",codon[0],codon[1],codon[2],dna2aa(codon));
2472 #endif
2473 codon += 3;
2474 }
2475 }
2476 }
2477 else // STRAND_REV
2478 {
2479 // right padding - number of bases to take from ref
2480 npad = (seq.m - (sbeg + seq.l)) % 3;
2481 #if DBG>1
2482 fprintf(stderr," npad: %d\n",npad);
2483 #endif
2484 if ( !(npad>=0 && sbeg+seq.l+npad<=seq.m) ) fprintf(stderr,"sbeg=%d seq.l=%d seq.m=%d npad=%d\n",sbeg,(int)seq.l,(int)seq.m,npad);
2485 assert( npad>=0 && sbeg+seq.l+npad<=seq.m ); // todo: first codon on the rev strand
2486
2487 if ( npad==2 )
2488 {
2489 tmp[1] = ref.s[rend+N_REF_PAD];
2490 tmp[2] = ref.s[rend+N_REF_PAD+1];
2491 i = 0;
2492 }
2493 else if ( npad==1 )
2494 {
2495 tmp[2] = ref.s[rend+N_REF_PAD];
2496 i = 1;
2497 }
2498 else
2499 i = 2;
2500
2501 end = seq.s + seq.l;
2502 for (; i>=0 && end>seq.s; i--) tmp[i] = *(--end);
2503 #if DBG>1
2504 fprintf(stderr,"\t i=%d\n", i);
2505 if(i==1)fprintf(stderr,"[0] %c\n",tmp[2]);
2506 if(i==0)fprintf(stderr,"[0] %c%c\n",tmp[1],tmp[2]);
2507 #endif
2508 if ( i==-1 )
2509 {
2510 #if DBG>1
2511 fprintf(stderr,"[1]%c%c%c\t%c\n",tmp[0],tmp[1],tmp[2], cdna2aa(tmp));
2512 #endif
2513 kputc_(cdna2aa(tmp), tseq);
2514 codon = end - 3;
2515 while ( codon >= seq.s )
2516 {
2517 kputc_(cdna2aa(codon), tseq);
2518 #if DBG>1
2519 fprintf(stderr,"[2]%c%c%c\t%c\n",codon[0],codon[1],codon[2], cdna2aa(codon));
2520 #endif
2521 codon -= 3;
2522 }
2523 if ( seq.s-codon==2 )
2524 {
2525 tmp[2] = seq.s[0];
2526 i = 1;
2527 }
2528 else if ( seq.s-codon==1 )
2529 {
2530 tmp[1] = seq.s[0];
2531 tmp[2] = seq.s[1];
2532 i = 0;
2533 }
2534 else
2535 i = -1;
2536 #if DBG>1
2537 if(i==1)fprintf(stderr,"[3] %c\n",tmp[2]);
2538 if(i==0)fprintf(stderr,"[3] %c%c\n",tmp[1],tmp[2]);
2539 #endif
2540 }
2541 // left padding
2542 end = ref.s + N_REF_PAD + rbeg;
2543 if ( i>=0 )
2544 {
2545 for (; i>=0 && end>=ref.s; i--) tmp[i] = *(--end);
2546 kputc_(cdna2aa(tmp), tseq);
2547 #if DBG>1
2548 fprintf(stderr,"[4]%c%c%c\t%c\n",tmp[0],tmp[1],tmp[2],cdna2aa(tmp));
2549 #endif
2550 }
2551 if ( fill!=0 )
2552 {
2553 codon = end - 3;
2554 while ( codon >= ref.s + N_REF_PAD )
2555 {
2556 kputc_(cdna2aa(codon), tseq);
2557 #if DBG>1
2558 fprintf(stderr,"[5]%c%c%c\t%c\n",codon[0],codon[1],codon[2],cdna2aa(codon));
2559 #endif
2560 codon -= 3;
2561 }
2562 }
2563 }
2564 kputc_(0,tseq); tseq->l--;
2565 #if DBG
2566 fprintf(stderr," tseq: %s\n", tseq->s);
2567 #endif
2568 }
2569
tscript_splice_ref(tscript_t * tr)2570 void tscript_splice_ref(tscript_t *tr)
2571 {
2572 int i, len = 0;
2573 for (i=0; i<tr->ncds; i++)
2574 len += tr->cds[i]->len;
2575
2576 tr->nsref = len + 2*N_REF_PAD;
2577 tr->sref = (char*) malloc(len + 1 + 2*N_REF_PAD);
2578 len = 0;
2579
2580 memcpy(tr->sref, tr->ref + tr->cds[0]->beg - tr->beg, N_REF_PAD);
2581 len += N_REF_PAD;
2582
2583 for (i=0; i<tr->ncds; i++)
2584 {
2585 memcpy(tr->sref + len, tr->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len);
2586 len += tr->cds[i]->len;
2587 }
2588 memcpy(tr->sref + len, tr->ref + N_REF_PAD + tr->cds[tr->ncds-1]->beg - tr->beg, N_REF_PAD);
2589 len += N_REF_PAD;
2590
2591 tr->sref[len] = 0;
2592 }
2593
2594 // returns: 0 if consequence was added, 1 if it already exists or could not be added
csq_push(args_t * args,csq_t * csq,bcf1_t * rec)2595 int csq_push(args_t *args, csq_t *csq, bcf1_t *rec)
2596 {
2597 #if XDBG
2598 fprintf(stderr,"csq_push: %d .. %d\n",rec->pos+1,csq->type.type);
2599 #endif
2600 khint_t k = kh_get(pos2vbuf, args->pos2vbuf, (int)csq->pos);
2601 vbuf_t *vbuf = (k == kh_end(args->pos2vbuf)) ? NULL : kh_val(args->pos2vbuf, k);
2602 if ( !vbuf ) error("This should not happen. %s:%d %s\n",bcf_seqname(args->hdr,rec),csq->pos+1,csq->type.vstr.s);
2603
2604 int i;
2605 for (i=0; i<vbuf->n; i++)
2606 if ( vbuf->vrec[i]->line==rec ) break;
2607 if ( i==vbuf->n ) error("This should not happen.. %s:%d %s\n", bcf_seqname(args->hdr,rec),csq->pos+1,csq->type.vstr.s);
2608 vrec_t *vrec = vbuf->vrec[i];
2609
2610 // if the variant overlaps donor/acceptor and also splice region, report only donor/acceptor
2611 if ( csq->type.type & CSQ_SPLICE_REGION && csq->type.type & (CSQ_SPLICE_DONOR|CSQ_SPLICE_ACCEPTOR) )
2612 csq->type.type &= ~CSQ_SPLICE_REGION;
2613
2614 if ( csq->type.type & CSQ_PRINTED_UPSTREAM )
2615 {
2616 for (i=0; i<vrec->nvcsq; i++)
2617 {
2618 // Same as below, to avoid records like
2619 // 3630 .. @3632,stop_lost|AL627309.1|ENST00000423372|protein_coding|-
2620 // 3632 .. stop_lost|AL627309.1|ENST00000423372|protein_coding|-|260*>260G|3630T>A+3632A>C
2621 if ( csq->type.type&CSQ_START_STOP && vrec->vcsq[i].type&CSQ_START_STOP )
2622 {
2623 vrec->vcsq[i] = csq->type;
2624 goto exit_duplicate;
2625 }
2626 if ( !(vrec->vcsq[i].type & CSQ_PRINTED_UPSTREAM) ) continue;
2627 if ( csq->type.ref != vrec->vcsq[i].ref ) continue;
2628 goto exit_duplicate;
2629 }
2630 }
2631 else if ( csq->type.type & CSQ_COMPOUND )
2632 {
2633 for (i=0; i<vrec->nvcsq; i++)
2634 {
2635 if ( csq->type.trid != vrec->vcsq[i].trid && (csq->type.type|vrec->vcsq[i].type)&CSQ_PRN_TSCRIPT ) continue;
2636 if ( csq->type.biotype != vrec->vcsq[i].biotype ) continue;
2637 if ( csq->type.gene != vrec->vcsq[i].gene ) continue;
2638 if ( csq->type.vcf_ial != vrec->vcsq[i].vcf_ial ) continue;
2639 if ( (csq->type.type&CSQ_UPSTREAM_STOP)^(vrec->vcsq[i].type&CSQ_UPSTREAM_STOP) ) continue; // both must or mustn't have upstream_stop
2640 if ( csq->type.vstr.s || vrec->vcsq[i].vstr.s )
2641 {
2642 // This is a bit hacky, but we want a simpler and more predictable output. The splice_csq() function
2643 // can trigger stop/start events based on indel overlap, then another stop/start event can be triggered
2644 // from add_csq() or test_cds_local() based on sequence comparison, and on output we could find two
2645 // consequences:
2646 // stop_lost|AL627309.1|ENST00000423372|protein_coding|-
2647 // stop_lost&inframe_insertion|AL627309.1|ENST00000423372|protein_coding|-|260*>260CL|3630T>TAAA
2648 if ( !csq->type.vstr.s || !vrec->vcsq[i].vstr.s )
2649 {
2650 if ( csq->type.type&CSQ_START_STOP && vrec->vcsq[i].type&CSQ_START_STOP )
2651 {
2652 vrec->vcsq[i].type |= csq->type.type;
2653
2654 // remove stop_lost&synonymous if stop_retained set
2655 if ( vrec->vcsq[i].type&CSQ_STOP_RETAINED )
2656 vrec->vcsq[i].type &= ~(CSQ_STOP_LOST|CSQ_SYNONYMOUS_VARIANT);
2657
2658 if ( !vrec->vcsq[i].vstr.s ) vrec->vcsq[i].vstr = csq->type.vstr;
2659 goto exit_duplicate;
2660 }
2661 continue;
2662 }
2663 if ( strcmp(csq->type.vstr.s,vrec->vcsq[i].vstr.s) ) continue;
2664 }
2665 vrec->vcsq[i].type |= csq->type.type;
2666 goto exit_duplicate;
2667 }
2668 }
2669 else
2670 {
2671 for (i=0; i<vrec->nvcsq; i++)
2672 {
2673 if ( csq->type.trid != vrec->vcsq[i].trid && (csq->type.type|vrec->vcsq[i].type)&CSQ_PRN_TSCRIPT) continue;
2674 if ( csq->type.biotype != vrec->vcsq[i].biotype ) continue;
2675 if ( !(vrec->vcsq[i].type & CSQ_COMPOUND) )
2676 {
2677 vrec->vcsq[i].type |= csq->type.type;
2678 goto exit_duplicate;
2679 }
2680 if ( vrec->vcsq[i].type==(vrec->vcsq[i].type|csq->type.type) ) goto exit_duplicate;
2681 }
2682 }
2683 // no such csq yet in this vcf record
2684 csq->vrec = vrec;
2685 csq->idx = vrec->nvcsq++;
2686 hts_expand0(vcsq_t, vrec->nvcsq, vrec->mvcsq, vrec->vcsq);
2687 vrec->vcsq[i] = csq->type;
2688 return 0;
2689
2690 exit_duplicate:
2691 csq->vrec = vrec;
2692 csq->idx = i;
2693 return 1;
2694 }
2695
2696 // soff .. position of the variant within the trimmed query transcript
2697 // sbeg .. position of the variant within the query transcript
2698 // rbeg .. position on the reference transcript (if there are no indels, then rbeg=send)
2699 // rpos .. VCF position
2700 #define node2soff(i) (hap->stack[i].slen - (hap->stack[i].node->rlen + hap->stack[i].node->dlen))
2701 #define node2sbeg(i) (hap->sbeg + node2soff(i))
2702 #define node2send(i) (hap->sbeg + hap->stack[i].slen)
2703 #define node2rbeg(i) (hap->stack[i].node->sbeg)
2704 #define node2rend(i) (hap->stack[i].node->sbeg + hap->stack[i].node->rlen)
2705 #define node2rpos(i) (hap->stack[i].node->rec->pos)
2706
kput_vcsq(args_t * args,vcsq_t * csq,kstring_t * str)2707 void kput_vcsq(args_t *args, vcsq_t *csq, kstring_t *str)
2708 {
2709 // Remove start/stop from incomplete CDS, but only if there is another
2710 // consequence as something must be reported
2711 if ( csq->type & CSQ_INCOMPLETE_CDS && (csq->type & ~(CSQ_START_STOP|CSQ_INCOMPLETE_CDS|CSQ_UPSTREAM_STOP)) ) csq->type &= ~(CSQ_START_STOP|CSQ_INCOMPLETE_CDS);
2712
2713 // Remove missense from start/stops
2714 if ( csq->type & CSQ_START_STOP && csq->type & CSQ_MISSENSE_VARIANT ) csq->type &= ~CSQ_MISSENSE_VARIANT;
2715
2716 if ( csq->type & CSQ_PRINTED_UPSTREAM && csq->ref )
2717 {
2718 kputc_('@',str);
2719 kputw(csq->ref->pos+1, str);
2720 return;
2721 }
2722 if ( csq->type & CSQ_UPSTREAM_STOP )
2723 kputc_('*',str);
2724
2725 int i, n = sizeof(csq_strings)/sizeof(char*);
2726 for (i=1; i<n; i++)
2727 if ( csq_strings[i] && csq->type&(1<<i) ) { kputs(csq_strings[i],str); break; }
2728 i++;
2729 for (; i<n; i++)
2730 if ( csq_strings[i] && csq->type&(1<<i) ) { kputc_('&',str); kputs(csq_strings[i],str); }
2731
2732 kputc_('|', str);
2733 if ( csq->gene ) kputs(csq->gene , str);
2734
2735 kputc_('|', str);
2736 if ( csq->type & CSQ_PRN_TSCRIPT ) kputs(args->tscript_ids.str[csq->trid], str);
2737
2738 kputc_('|', str);
2739 kputs(gf_type2gff_string(csq->biotype), str);
2740
2741 if ( CSQ_PRN_STRAND(csq->type) || csq->vstr.l )
2742 kputs(csq->strand==STRAND_FWD ? "|+" : "|-", str);
2743
2744 if ( csq->vstr.l )
2745 kputs(csq->vstr.s, str);
2746 }
2747
kprint_aa_prediction(args_t * args,int beg,kstring_t * aa,kstring_t * str)2748 void kprint_aa_prediction(args_t *args, int beg, kstring_t *aa, kstring_t *str)
2749 {
2750 if ( !args->brief_predictions || (int)aa->l - args->brief_predictions < 3 )
2751 kputs(aa->s, str);
2752 else
2753 {
2754 int i, len = aa->l;
2755 if ( aa->s[len-1]=='*' ) len--;
2756 for (i=0; i<len && i<args->brief_predictions; i++) kputc(aa->s[i], str);
2757 kputs("..", str);
2758 kputw(beg+len, str);
2759 }
2760 }
2761
hap_add_csq(args_t * args,hap_t * hap,hap_node_t * node,int tlen,int ibeg,int iend,int dlen,int indel)2762 void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, int iend, int dlen, int indel)
2763 {
2764 int i;
2765 tscript_t *tr = hap->tr;
2766 int ref_node = tr->strand==STRAND_FWD ? ibeg : iend;
2767 int icsq = node->ncsq_list++;
2768 hts_expand0(csq_t,node->ncsq_list,node->mcsq_list,node->csq_list);
2769 csq_t *csq = &node->csq_list[icsq];
2770 csq->pos = hap->stack[ref_node].node->rec->pos;
2771 csq->type.trid = tr->id;
2772 csq->type.vcf_ial = node->vcf_ial;
2773 csq->type.gene = tr->gene->name;
2774 csq->type.strand = tr->strand;
2775 csq->type.biotype = tr->type;
2776
2777 // only now we see the translated sequence and can determine if the stop/start changes are real
2778 int rm_csq = 0;
2779 csq->type.type = 0;
2780 for (i=ibeg; i<=iend; i++)
2781 csq->type.type |= hap->stack[i].node->csq & CSQ_COMPOUND;
2782 if ( dlen==0 && indel ) csq->type.type |= CSQ_INFRAME_ALTERING;
2783
2784 int has_upstream_stop = hap->upstream_stop;
2785 if ( hap->stack[ibeg].node->type != HAP_SSS )
2786 {
2787 // check for truncating stops
2788 for (i=0; i<hap->tref.l; i++)
2789 if ( hap->tref.s[i]=='*' ) break;
2790 if ( i!=hap->tref.l )
2791 {
2792 hap->tref.l = i+1;
2793 hap->tref.s[i+1] = 0;
2794 }
2795 for (i=0; i<hap->tseq.l; i++)
2796 if ( hap->tseq.s[i]=='*' ) break;
2797 if ( i!=hap->tseq.l )
2798 {
2799 hap->tseq.l = i+1;
2800 hap->tseq.s[i+1] = 0;
2801 hap->upstream_stop = 1;
2802 }
2803 if ( csq->type.type & CSQ_STOP_LOST )
2804 {
2805 if ( hap->tref.s[hap->tref.l-1]=='*' && hap->tref.s[hap->tref.l-1] == hap->tseq.s[hap->tseq.l-1] )
2806 {
2807 rm_csq |= CSQ_STOP_LOST;
2808 csq->type.type |= CSQ_STOP_RETAINED;
2809 }
2810 else if ( hap->tref.s[hap->tref.l-1]!='*' )
2811 {
2812 // This is CDS 3' incomplete ENSG00000173376/synon.vcf, can also be missense
2813 // We observe in real data a change to a stop, ENST00000528237/retained-stop-incomplete-cds.vcf
2814 if ( hap->tseq.s[hap->tseq.l-1] == '*' )
2815 {
2816 rm_csq |= CSQ_STOP_GAINED;
2817 csq->type.type |= CSQ_STOP_RETAINED;
2818 }
2819 else
2820 csq->type.type |= CSQ_INCOMPLETE_CDS;
2821 }
2822 }
2823 if ( csq->type.type & CSQ_START_LOST && hap->tref.s[0]!='M' )
2824 {
2825 rm_csq |= CSQ_START_LOST;
2826 csq->type.type &= ~CSQ_START_LOST;
2827 }
2828 if ( dlen!=0 )
2829 {
2830 if ( dlen%3 )
2831 csq->type.type |= CSQ_FRAMESHIFT_VARIANT;
2832 else if ( dlen<0 )
2833 csq->type.type |= CSQ_INFRAME_DELETION;
2834 else
2835 csq->type.type |= CSQ_INFRAME_INSERTION;
2836 if ( hap->tref.s[hap->tref.l-1]!='*' && hap->tseq.s[hap->tseq.l-1]=='*' )
2837 csq->type.type |= CSQ_STOP_GAINED;
2838 }
2839 else
2840 {
2841 for (i=0; i<hap->tref.l; i++)
2842 if ( hap->tref.s[i] != hap->tseq.s[i] ) break;
2843 if ( i==hap->tref.l )
2844 csq->type.type |= CSQ_SYNONYMOUS_VARIANT;
2845 else if ( hap->tref.s[i] == '*' )
2846 csq->type.type |= CSQ_STOP_LOST;
2847 else if ( hap->tseq.s[i] == '*' )
2848 csq->type.type |= CSQ_STOP_GAINED;
2849 else
2850 csq->type.type |= CSQ_MISSENSE_VARIANT;
2851 }
2852 }
2853 // Check if compound inframe variants are real inframes, or if the stop codon occurs before the frameshift can be restored
2854 if ( ibeg!=iend && (csq->type.type & (CSQ_INFRAME_DELETION|CSQ_INFRAME_INSERTION|CSQ_INFRAME_ALTERING)) && hap->tseq.s[hap->tseq.l-1]=='*' )
2855 {
2856 rm_csq |= CSQ_INFRAME_DELETION | CSQ_INFRAME_INSERTION | CSQ_INFRAME_ALTERING;
2857 csq->type.type |= CSQ_FRAMESHIFT_VARIANT | CSQ_STOP_GAINED;
2858 }
2859 if ( has_upstream_stop ) csq->type.type |= CSQ_UPSTREAM_STOP;
2860 csq->type.type &= ~rm_csq;
2861
2862 if ( hap->stack[ibeg].node->type == HAP_SSS )
2863 {
2864 node->csq_list[icsq].type.type |= hap->stack[ibeg].node->csq & ~rm_csq;
2865 node->csq_list[icsq].type.ref = hap->stack[ibeg].node->rec;
2866 node->csq_list[icsq].type.biotype = tr->type;
2867 csq_push(args, node->csq_list+icsq, hap->stack[ibeg].node->rec);
2868 return;
2869 }
2870
2871 kstring_t str = node->csq_list[icsq].type.vstr;
2872 str.l = 0;
2873
2874 // create the aa variant string
2875 int aa_rbeg = tr->strand==STRAND_FWD ? node2rbeg(ibeg)/3+1 : (hap->tr->nsref - 2*N_REF_PAD - node2rend(iend))/3+1;
2876 int aa_sbeg = tr->strand==STRAND_FWD ? node2sbeg(ibeg)/3+1 : (tlen - node2send(iend))/3+1;
2877 kputc_('|', &str);
2878 kputw(aa_rbeg, &str);
2879 kprint_aa_prediction(args,aa_rbeg,&hap->tref,&str);
2880 if ( !(csq->type.type & CSQ_SYNONYMOUS_VARIANT) )
2881 {
2882 kputc_('>', &str);
2883 kputw(aa_sbeg, &str);
2884 kprint_aa_prediction(args,aa_sbeg,&hap->tseq,&str);
2885 }
2886 kputc_('|', &str);
2887
2888 // create the dna variant string and, in case of combined variants,
2889 // insert silent CSQ_PRINTED_UPSTREAM variants
2890 for (i=ibeg; i<=iend; i++)
2891 {
2892 if ( i>ibeg ) kputc_('+', &str);
2893 kputw(node2rpos(i)+1, &str);
2894 kputs(hap->stack[i].node->var, &str);
2895 }
2896 node->csq_list[icsq].type.vstr = str;
2897 csq_push(args, node->csq_list+icsq, hap->stack[ref_node].node->rec);
2898
2899 for (i=ibeg; i<=iend; i++)
2900 {
2901 // csq are printed at one position only for combined variants, the rest is
2902 // silent and references the first
2903 if ( hap->stack[i].node->csq & ~CSQ_COMPOUND )
2904 {
2905 node->ncsq_list++;
2906 hts_expand0(csq_t,node->ncsq_list,node->mcsq_list,node->csq_list);
2907 csq_t *tmp_csq = &node->csq_list[node->ncsq_list - 1];
2908 tmp_csq->pos = hap->stack[i].node->rec->pos;
2909 tmp_csq->type.trid = tr->id;
2910 //??tmp_csq->type.vcf_ial = node->vcf_ial; .. this should not be needed for non-compound variants
2911 tmp_csq->type.gene = tr->gene->name;
2912 tmp_csq->type.strand = tr->strand;
2913 tmp_csq->type.type = hap->stack[i].node->csq & ~CSQ_COMPOUND & ~rm_csq;
2914 tmp_csq->type.biotype = tr->type;
2915 tmp_csq->type.vstr.l = 0;
2916 kputs(str.s,&tmp_csq->type.vstr);
2917 csq_push(args, tmp_csq, hap->stack[i].node->rec);
2918 }
2919 if ( i!=ref_node && (node->csq_list[icsq].type.type & CSQ_COMPOUND || !(hap->stack[i].node->csq & ~CSQ_COMPOUND)) )
2920 {
2921 node->ncsq_list++;
2922 hts_expand0(csq_t,node->ncsq_list,node->mcsq_list,node->csq_list);
2923 csq_t *tmp_csq = &node->csq_list[node->ncsq_list - 1];
2924 tmp_csq->pos = hap->stack[i].node->rec->pos;
2925 tmp_csq->type.trid = tr->id;
2926 //??tmp_csq->type.vcf_ial = node->vcf_ial; .. this should not be needed for non-compound variants
2927 tmp_csq->type.gene = tr->gene->name;
2928 tmp_csq->type.strand = tr->strand;
2929 tmp_csq->type.type = CSQ_PRINTED_UPSTREAM | hap->stack[i].node->csq;
2930 tmp_csq->type.biotype = tr->type;
2931 tmp_csq->type.ref = hap->stack[ref_node].node->rec;
2932 tmp_csq->type.vstr.l = 0;
2933 csq_push(args, tmp_csq, hap->stack[i].node->rec);
2934 }
2935 }
2936 }
2937
2938
hap_finalize(args_t * args,hap_t * hap)2939 void hap_finalize(args_t *args, hap_t *hap)
2940 {
2941 tscript_t *tr = hap->tr;
2942 if ( !tr->sref )
2943 tscript_splice_ref(tr);
2944
2945 kstring_t sref;
2946 sref.s = tr->sref;
2947 sref.l = tr->nsref;
2948 sref.m = sref.l;
2949
2950 int istack = 0;
2951 hts_expand(hstack_t,1,hap->mstack,hap->stack);
2952
2953 hap->sseq.l = 0;
2954 hap->tseq.l = 0;
2955 hap->stack[0].node = tr->root;
2956 hap->stack[0].ichild = -1;
2957 hap->stack[0].slen = 0;
2958 hap->stack[0].dlen = 0;
2959
2960 while ( istack>=0 )
2961 {
2962 hstack_t *stack = &hap->stack[istack];
2963 hap_node_t *node = hap->stack[istack].node;
2964 while ( ++hap->stack[istack].ichild < node->nchild )
2965 {
2966 if ( node->child[stack->ichild] ) break;
2967 }
2968 if ( stack->ichild == node->nchild ) { istack--; continue; }
2969
2970 node = node->child[stack->ichild];
2971
2972 istack++;
2973 hts_expand(hstack_t,istack+1,hap->mstack,hap->stack);
2974 stack = &hap->stack[istack-1];
2975
2976 hap->stack[istack].node = node;
2977 hap->stack[istack].ichild = -1;
2978
2979 hap->sseq.l = stack->slen;
2980 if ( node->type==HAP_CDS ) kputs(node->seq, &hap->sseq);
2981 hap->stack[istack].slen = hap->sseq.l;
2982 hap->stack[istack].dlen = hap->stack[istack-1].dlen + node->dlen;
2983
2984 if ( !node->nend ) continue; // not a leaf node
2985
2986 // The spliced sequence has been built for the current haplotype and stored
2987 // in hap->sseq. Now we break it and output as independent parts
2988
2989 kstring_t sseq;
2990 sseq.m = sref.m - 2*N_REF_PAD + hap->stack[istack].dlen; // total length of the spliced query transcript
2991 hap->upstream_stop = 0;
2992
2993 int i = 1, dlen = 0, ibeg, indel = 0;
2994 hap->sbeg = hap->stack[i].node->sbeg;
2995 assert( hap->stack[istack].node->type != HAP_SSS );
2996 if ( tr->strand==STRAND_FWD )
2997 {
2998 i = 0, ibeg = -1;
2999 while ( ++i <= istack )
3000 {
3001 assert( hap->stack[i].node->type != HAP_SSS );
3002
3003 dlen += hap->stack[i].node->dlen;
3004 if ( hap->stack[i].node->dlen ) indel = 1;
3005
3006 // This condition extends compound variants.
3007 if ( i<istack )
3008 {
3009 if ( dlen%3 ) // frameshift
3010 {
3011 if ( ibeg==-1 ) ibeg = i;
3012 continue;
3013 }
3014 // the last base of the current variant vs the first base of the next
3015 // variant: are they in the same codon? (forward strand)
3016 int icur = node2sbeg(i);
3017 int inext = node2sbeg(i+1);
3018 if ( hap->stack[i].node->dlen > 0 ) icur += hap->stack[i].node->dlen;
3019 else if ( hap->stack[i].node->dlen < 0 ) icur++;
3020 if ( icur/3 == inext/3 ) // in the same codon, can't be flushed yet
3021 {
3022 if ( ibeg==-1 ) ibeg = i;
3023 continue;
3024 }
3025 }
3026 if ( ibeg<0 ) ibeg = i;
3027
3028 int ioff = node2soff(ibeg);
3029 int icur = node2sbeg(ibeg);
3030 int rbeg = node2rbeg(ibeg);
3031 int rend = node2rend(i);
3032 int fill = dlen%3;
3033
3034 // alt
3035 if ( hap->sseq.l )
3036 {
3037 sseq.l = hap->stack[i].slen - ioff;
3038 sseq.s = hap->sseq.s + ioff;
3039 }
3040 else // splice site overlap, see #1475227917
3041 sseq.l = fill = 0;
3042 cds_translate(&sref, &sseq, icur,rbeg,rend, tr->strand, &hap->tseq, fill);
3043
3044 // ref
3045 sseq.l = node2rend(i) - rbeg;
3046 sseq.s = sref.s + N_REF_PAD + rbeg;
3047 sseq.m = sref.m - 2*N_REF_PAD;
3048 cds_translate(&sref, &sseq, rbeg,rbeg,rend, tr->strand, &hap->tref, fill);
3049 sseq.m = sref.m - 2*N_REF_PAD + hap->stack[istack].dlen;
3050
3051 hap_add_csq(args,hap,node,0, ibeg,i,dlen,indel);
3052 ibeg = -1;
3053 dlen = 0;
3054 indel = 0;
3055 }
3056 }
3057 else
3058 {
3059 i = istack + 1, ibeg = -1;
3060 while ( --i > 0 )
3061 {
3062 assert ( hap->stack[i].node->type != HAP_SSS );
3063 dlen += hap->stack[i].node->dlen;
3064 if ( hap->stack[i].node->dlen ) indel = 1;
3065 if ( i>1 )
3066 {
3067 if ( dlen%3 )
3068 {
3069 if ( ibeg==-1 ) ibeg = i;
3070 continue;
3071 }
3072 // the last base of the current variant vs the first base of the next
3073 // variant: are they in the same codon? (reverse strand)
3074 int icur = sseq.m - 1 - node2sbeg(i);
3075 int inext = sseq.m - 1 - node2sbeg(i-1);
3076 if ( hap->stack[i].node->dlen > 0 ) icur += hap->stack[i].node->dlen - 1;
3077 else if ( hap->stack[i].node->dlen < 0 ) icur -= hap->stack[i].node->dlen;
3078 if ( hap->stack[i-1].node->dlen > 0 ) inext -= hap->stack[i-1].node->dlen;
3079 if ( icur/3 == inext/3 )
3080 {
3081 if ( ibeg==-1 ) ibeg = i;
3082 continue;
3083 }
3084 }
3085 if ( ibeg<0 ) ibeg = i;
3086 int ioff = node2soff(i);
3087 int icur = node2sbeg(i);
3088 int rbeg = node2rbeg(i);
3089 int rend = node2rend(ibeg);
3090 int fill = dlen%3;
3091
3092 // alt
3093 if ( hap->sseq.l )
3094 {
3095 sseq.l = hap->stack[ibeg].slen - ioff;
3096 sseq.s = hap->sseq.s + ioff;
3097 }
3098 else // splice site overlap, see #1475227917
3099 sseq.l = fill = 0;
3100 cds_translate(&sref, &sseq, icur,rbeg,rend, tr->strand, &hap->tseq, fill);
3101
3102 // ref
3103 sseq.l = node2rend(ibeg) - rbeg;
3104 sseq.s = sref.s + N_REF_PAD + rbeg;
3105 sseq.m = sref.m - 2*N_REF_PAD;
3106 cds_translate(&sref, &sseq, rbeg,rbeg,rend, tr->strand, &hap->tref, fill);
3107 sseq.m = sref.m - 2*N_REF_PAD + hap->stack[istack].dlen;
3108
3109 hap_add_csq(args,hap,node,sseq.m, i,ibeg,dlen,indel);
3110 ibeg = -1;
3111 dlen = 0;
3112 indel = 0;
3113 }
3114 }
3115 }
3116 }
3117
csq_print_text(args_t * args,csq_t * csq,int ismpl,int ihap)3118 static inline void csq_print_text(args_t *args, csq_t *csq, int ismpl, int ihap)
3119 {
3120 if ( csq->type.type & CSQ_PRINTED_UPSTREAM ) return;
3121
3122 char *smpl = ismpl >= 0 ? args->hdr->samples[ismpl] : "-";
3123 const char *chr = bcf_hdr_id2name(args->hdr,args->rid);
3124
3125 fprintf(args->out,"CSQ\t%s\t", smpl);
3126 if ( ihap>0 )
3127 fprintf(args->out,"%d", ihap);
3128 else
3129 fprintf(args->out,"-");
3130
3131 args->str.l = 0;
3132 kput_vcsq(args, &csq->type, &args->str);
3133 fprintf(args->out,"\t%s\t%d\t%s\n",chr,csq->pos+1,args->str.s);
3134 }
hap_print_text(args_t * args,tscript_t * tr,int ismpl,int ihap,hap_node_t * node)3135 static inline void hap_print_text(args_t *args, tscript_t *tr, int ismpl, int ihap, hap_node_t *node)
3136 {
3137 if ( !node || !node->ncsq_list ) return;
3138
3139 char *smpl = ismpl >= 0 ? args->hdr->samples[ismpl] : "-";
3140 const char *chr = bcf_hdr_id2name(args->hdr,args->rid);
3141
3142 int i;
3143 for (i=0; i<node->ncsq_list; i++)
3144 {
3145 csq_t *csq = node->csq_list + i;
3146 if ( csq->type.type & CSQ_PRINTED_UPSTREAM ) continue;
3147 assert( csq->type.vstr.l );
3148
3149 fprintf(args->out,"CSQ\t%s\t", smpl);
3150 if ( ihap>0 )
3151 fprintf(args->out,"%d", ihap);
3152 else
3153 fprintf(args->out,"-");
3154
3155 args->str.l = 0;
3156 kput_vcsq(args, &csq->type, &args->str);
3157 fprintf(args->out,"\t%s\t%d\t%s\n",chr,csq->pos+1,args->str.s);
3158 }
3159 }
3160
hap_stage_vcf(args_t * args,tscript_t * tr,int ismpl,int ihap,hap_node_t * node)3161 static inline void hap_stage_vcf(args_t *args, tscript_t *tr, int ismpl, int ihap, hap_node_t *node)
3162 {
3163 if ( !node || !node->ncsq_list || ismpl<0 ) return;
3164
3165 int i;
3166 for (i=0; i<node->ncsq_list; i++)
3167 {
3168 csq_t *csq = node->csq_list + i;
3169 vrec_t *vrec = csq->vrec;
3170 int icsq2 = 2*csq->idx + ihap;
3171 if ( icsq2 >= args->ncsq2_max ) // more than ncsq2_max consequences, so can't fit it in FMT
3172 {
3173 if ( args->verbosity && (!args->ncsq2_small_warned || args->verbosity > 1) )
3174 {
3175 fprintf(stderr,
3176 "Warning: Too many consequences for sample %s at %s:%"PRId64", keeping the first %d and skipping the rest.\n",
3177 args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),(int64_t) vrec->line->pos+1,csq->idx);
3178 if ( !args->ncsq2_small_warned )
3179 fprintf(stderr," The limit can be increased by setting the --ncsq parameter. This warning is printed only once.\n");
3180 }
3181 if ( args->ncsq2_small_warned < icsq2 ) args->ncsq2_small_warned = icsq2;
3182 break;
3183 }
3184 int ival, ibit;
3185 icsq2_to_bit(icsq2, &ival,&ibit);
3186 if ( vrec->nfmt < 1 + ival ) vrec->nfmt = 1 + ival;
3187 vrec->fmt_bm[ismpl*args->nfmt_bcsq + ival] |= 1 << ibit;
3188 }
3189 }
3190
hap_flush(args_t * args,uint32_t pos)3191 void hap_flush(args_t *args, uint32_t pos)
3192 {
3193 int i,j;
3194 tr_heap_t *heap = args->active_tr;
3195 while ( heap->ndat && heap->dat[0]->end<=pos )
3196 {
3197 tscript_t *tr = heap->dat[0];
3198 khp_delete(trhp, heap);
3199 args->hap->tr = tr;
3200 if ( tr->root && tr->root->nchild ) // normal, non-localized calling
3201 {
3202 hap_finalize(args, args->hap);
3203
3204 if ( args->output_type==FT_TAB_TEXT ) // plain text output, not a vcf
3205 {
3206 if ( args->phase==PHASE_DROP_GT )
3207 hap_print_text(args, tr, -1,0, tr->hap[0]);
3208 else
3209 {
3210 for (i=0; i<args->smpl->n; i++)
3211 {
3212 for (j=0; j<2; j++)
3213 hap_print_text(args, tr, args->smpl->idx[i],j+1, tr->hap[i*2+j]);
3214 }
3215 }
3216 }
3217 else if ( args->phase!=PHASE_DROP_GT )
3218 {
3219 for (i=0; i<args->smpl->n; i++)
3220 {
3221 for (j=0; j<2; j++)
3222 hap_stage_vcf(args, tr, args->smpl->idx[i],j, tr->hap[i*2+j]);
3223 }
3224 }
3225 }
3226
3227 // mark the transcript for deletion. Cannot delete it immediately because
3228 // by-position VCF output will need them when flushed by vcf_buf_push
3229 args->nrm_tr++;
3230 hts_expand(tscript_t*,args->nrm_tr,args->mrm_tr,args->rm_tr);
3231 args->rm_tr[args->nrm_tr-1] = tr;
3232 }
3233 }
3234
3235 #define SWAP(type_t, a, b) { type_t t = a; a = b; b = t; }
3236
vbuf_push(args_t * args,bcf1_t ** rec_ptr)3237 vbuf_t *vbuf_push(args_t *args, bcf1_t **rec_ptr)
3238 {
3239 int i;
3240
3241 assert(rec_ptr);
3242 bcf1_t *rec = *rec_ptr;
3243
3244 // check for duplicate records
3245 i = args->vcf_rbuf.n ? rbuf_last(&args->vcf_rbuf) : -1;
3246 if ( i<0 || args->vcf_buf[i]->vrec[0]->line->pos!=rec->pos )
3247 {
3248 // vcf record with a new pos
3249 rbuf_expand0(&args->vcf_rbuf, vbuf_t*, args->vcf_rbuf.n+1, args->vcf_buf);
3250 i = rbuf_append(&args->vcf_rbuf);
3251 if ( !args->vcf_buf[i] ) args->vcf_buf[i] = (vbuf_t*) calloc(1,sizeof(vbuf_t));
3252 args->vcf_buf[i]->n = 0;
3253 args->vcf_buf[i]->keep_until = 0;
3254 }
3255 vbuf_t *vbuf = args->vcf_buf[i];
3256 vbuf->n++;
3257 hts_expand0(vrec_t*, vbuf->n, vbuf->m, vbuf->vrec);
3258 if ( !vbuf->vrec[vbuf->n - 1] )
3259 vbuf->vrec[vbuf->n - 1] = (vrec_t*) calloc(1,sizeof(vrec_t));
3260
3261 vrec_t *vrec = vbuf->vrec[vbuf->n - 1];
3262 if ( args->phase!=PHASE_DROP_GT && args->smpl->n )
3263 {
3264 if ( !vrec->fmt_bm ) vrec->fmt_bm = (uint32_t*) calloc(args->hdr_nsmpl,sizeof(*vrec->fmt_bm) * args->nfmt_bcsq);
3265 else memset(vrec->fmt_bm,0,args->hdr_nsmpl*sizeof(*vrec->fmt_bm) * args->nfmt_bcsq);
3266 }
3267 if ( !vrec->line ) vrec->line = bcf_init1();
3268 SWAP(bcf1_t*, (*rec_ptr), vrec->line);
3269
3270 int ret;
3271 khint_t k = kh_put(pos2vbuf, args->pos2vbuf, (int)rec->pos, &ret);
3272 kh_val(args->pos2vbuf,k) = vbuf;
3273
3274 return vbuf;
3275 }
3276
vbuf_flush(args_t * args,uint32_t pos)3277 void vbuf_flush(args_t *args, uint32_t pos)
3278 {
3279 int i,j;
3280 while ( args->vcf_rbuf.n )
3281 {
3282 vbuf_t *vbuf;
3283 if ( !args->local_csq && args->active_tr->ndat )
3284 {
3285 // check if the first active transcript starts beyond the first buffered VCF record,
3286 // cannot output buffered VCF lines (args.vbuf) until the active transcripts are gone
3287 vbuf = args->vcf_buf[ args->vcf_rbuf.f ];
3288 if ( vbuf->keep_until > pos ) break;
3289 assert( vbuf->n );
3290 }
3291
3292 i = rbuf_shift(&args->vcf_rbuf);
3293 assert( i>=0 );
3294 vbuf = args->vcf_buf[i];
3295 int pos = vbuf->n ? vbuf->vrec[0]->line->pos : -1;
3296 for (i=0; i<vbuf->n; i++)
3297 {
3298 vrec_t *vrec = vbuf->vrec[i];
3299 if ( !args->out_fh ) // not a VCF output
3300 {
3301 vrec->nvcsq = 0;
3302 continue;
3303 }
3304 if ( !vrec->nvcsq )
3305 {
3306 if ( bcf_write(args->out_fh, args->hdr, vrec->line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname?args->output_fname:"standard output");
3307 int save_pos = vrec->line->pos;
3308 bcf_empty(vrec->line);
3309 vrec->line->pos = save_pos; // this is necessary for compound variants
3310 continue;
3311 }
3312
3313 args->str.l = 0;
3314 kput_vcsq(args, &vrec->vcsq[0], &args->str);
3315 for (j=1; j<vrec->nvcsq; j++)
3316 {
3317 kputc_(',', &args->str);
3318 kput_vcsq(args, &vrec->vcsq[j], &args->str);
3319 }
3320 bcf_update_info_string(args->hdr, vrec->line, args->bcsq_tag, args->str.s);
3321 if ( args->hdr_nsmpl )
3322 {
3323 if ( vrec->nfmt < args->nfmt_bcsq )
3324 for (j=1; j<args->hdr_nsmpl; j++)
3325 memmove(&vrec->fmt_bm[j*vrec->nfmt], &vrec->fmt_bm[j*args->nfmt_bcsq], vrec->nfmt*sizeof(*vrec->fmt_bm));
3326 bcf_update_format_int32(args->hdr, vrec->line, args->bcsq_tag, vrec->fmt_bm, args->hdr_nsmpl*vrec->nfmt);
3327 }
3328 vrec->nvcsq = 0;
3329 if ( bcf_write(args->out_fh, args->hdr, vrec->line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname?args->output_fname:"standard output");
3330 int save_pos = vrec->line->pos;
3331 bcf_empty(vrec->line);
3332 vrec->line->pos = save_pos;
3333 }
3334 if ( pos!=-1 )
3335 {
3336 khint_t k = kh_get(pos2vbuf, args->pos2vbuf, pos);
3337 if ( k != kh_end(args->pos2vbuf) ) kh_del(pos2vbuf, args->pos2vbuf, k);
3338 }
3339 vbuf->n = 0;
3340 }
3341 if ( args->active_tr->ndat ) return;
3342
3343 for (i=0; i<args->nrm_tr; i++)
3344 {
3345 tscript_t *tr = args->rm_tr[i];
3346 if ( tr->root ) hap_destroy(tr->root);
3347 tr->root = NULL;
3348 free(tr->hap);
3349 free(tr->ref);
3350 free(tr->sref);
3351 }
3352 args->nrm_tr = 0;
3353 args->ncsq_buf = 0;
3354 }
3355
tscript_init_ref(args_t * args,tscript_t * tr,const char * chr)3356 void tscript_init_ref(args_t *args, tscript_t *tr, const char *chr)
3357 {
3358 int i, len;
3359 int pad_beg = tr->beg >= N_REF_PAD ? N_REF_PAD : tr->beg;
3360
3361 tr->ref = faidx_fetch_seq(args->fai, chr, tr->beg - pad_beg, tr->end + N_REF_PAD, &len);
3362 if ( !tr->ref )
3363 error("faidx_fetch_seq failed %s:%d-%d\n", chr,tr->beg+1,tr->end+1);
3364
3365 int pad_end = len - (tr->end - tr->beg + 1 + pad_beg);
3366 if ( pad_beg + pad_end != 2*N_REF_PAD )
3367 {
3368 char *ref = (char*) malloc(tr->end - tr->beg + 1 + 2*N_REF_PAD + 1);
3369 for (i=0; i < N_REF_PAD - pad_beg; i++) ref[i] = 'N';
3370 memcpy(ref+i, tr->ref, len);
3371 len += i;
3372 for (i=0; i < N_REF_PAD - pad_end; i++) ref[i+len] = 'N';
3373 ref[i+len] = 0;
3374 free(tr->ref);
3375 tr->ref = ref;
3376 }
3377 }
3378
sanity_check_ref(args_t * args,tscript_t * tr,bcf1_t * rec)3379 static void sanity_check_ref(args_t *args, tscript_t *tr, bcf1_t *rec)
3380 {
3381 int vbeg = 0;
3382 int rbeg = rec->pos - tr->beg + N_REF_PAD;
3383 if ( rbeg < 0 ) { vbeg += abs(rbeg); rbeg = 0; }
3384 char *ref = tr->ref + rbeg;
3385 char *vcf = rec->d.allele[0] + vbeg;
3386 assert( vcf - rec->d.allele[0] < strlen(rec->d.allele[0]) && ref - tr->ref < tr->end - tr->beg + 2*N_REF_PAD );
3387 int i = 0;
3388 while ( ref[i] && vcf[i] )
3389 {
3390 if ( ref[i]!=vcf[i] && toupper(ref[i])!=toupper(vcf[i]) )
3391 error("Error: the fasta reference does not match the VCF REF allele at %s:%"PRId64" .. fasta=%c vcf=%c\n",
3392 bcf_seqname(args->hdr,rec),(int64_t) rec->pos+vbeg+1,ref[i],vcf[i]);
3393 i++;
3394 }
3395 }
3396
test_cds_local(args_t * args,bcf1_t * rec)3397 int test_cds_local(args_t *args, bcf1_t *rec)
3398 {
3399 int i,j, ret = 0;
3400 const char *chr = bcf_seqname(args->hdr,rec);
3401 // note that the off-by-one extension of rlen is deliberate to account for insertions
3402 if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
3403
3404 // structures to fake the normal test_cds machinery
3405 hap_node_t root, node;
3406 root.type = HAP_ROOT;
3407 kstring_t *tref = &args->hap->tref, *tseq = &args->hap->tseq;
3408
3409 while ( regitr_overlap(args->itr) )
3410 {
3411 gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*);
3412 tscript_t *tr = cds->tr;
3413 if ( !GF_is_coding(tr->type) ) continue;
3414 ret = 1;
3415
3416 if ( !tr->ref )
3417 {
3418 tscript_init_ref(args, tr, chr);
3419 tscript_splice_ref(tr);
3420 khp_insert(trhp, args->active_tr, &tr); // only to clean the reference afterwards
3421 }
3422
3423 sanity_check_ref(args, tr, rec);
3424
3425 kstring_t sref;
3426 sref.s = tr->sref;
3427 sref.l = tr->nsref;
3428 sref.m = sref.l;
3429
3430 for (i=1; i<rec->n_allele; i++)
3431 {
3432 if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; }
3433 if ( hap_init(args, &root, &node, cds, rec, i)!=0 ) continue;
3434
3435 csq_t csq;
3436 memset(&csq, 0, sizeof(csq_t));
3437 csq.pos = rec->pos;
3438 csq.type.biotype = tr->type;
3439 csq.type.strand = tr->strand;
3440 csq.type.trid = tr->id;
3441 csq.type.vcf_ial = i;
3442 csq.type.gene = tr->gene->name;
3443
3444 int csq_type = node.csq;
3445
3446 // code repetition: it would be nice to reuse the code from hap_add_csq, needs refactoring though
3447 if ( node.type == HAP_SSS )
3448 {
3449 csq.type.type = csq_type;
3450 csq_stage(args, &csq, rec);
3451 }
3452 else
3453 {
3454 kstring_t sseq;
3455 sseq.m = sref.m - 2*N_REF_PAD + node.dlen;
3456 sseq.s = node.seq;
3457 int alen = sseq.l = strlen(sseq.s);
3458 int fill = node.dlen%3 && alen ? 1 : 0; // see #1475227917
3459 cds_translate(&sref, &sseq, node.sbeg,node.sbeg,node.sbeg+node.rlen, tr->strand, tseq, fill);
3460
3461 sseq.m = sref.m - 2*N_REF_PAD;
3462 sseq.s = sref.s + N_REF_PAD + node.sbeg;
3463 sseq.l = node.rlen;
3464 cds_translate(&sref, &sseq, node.sbeg,node.sbeg,node.sbeg+node.rlen, tr->strand, tref, fill);
3465
3466 // check for truncating stops
3467 for (j=0; j<tref->l; j++)
3468 if ( tref->s[j]=='*' ) break;
3469 if ( j!=tref->l )
3470 {
3471 tref->l = j+1;
3472 tref->s[j+1] = 0;
3473 }
3474 for (j=0; j<tseq->l; j++)
3475 if ( tseq->s[j]=='*' ) break;
3476 if ( j!=tseq->l )
3477 {
3478 tseq->l = j+1;
3479 tseq->s[j+1] = 0;
3480 }
3481 if ( csq_type & CSQ_STOP_LOST )
3482 {
3483 if ( tref->s[tref->l-1]=='*' && tref->s[tref->l-1] == tseq->s[tseq->l-1] )
3484 {
3485 csq_type &= ~CSQ_STOP_LOST;
3486 csq_type |= CSQ_STOP_RETAINED;
3487 }
3488 else if (tref->s[tref->l-1]!='*' )
3489 {
3490 // This is CDS 3' incomplete ENSG00000173376/synon.vcf, can also be missense
3491 // We observe in real data a change to a stop, ENST00000528237/retained-stop-incomplete-cds.vcf
3492 if ( tseq->s[tseq->l-1] == '*' )
3493 {
3494 csq_type &= ~CSQ_STOP_GAINED;
3495 csq_type |= CSQ_STOP_RETAINED;
3496 }
3497 else
3498 csq_type |= CSQ_INCOMPLETE_CDS;
3499 }
3500 }
3501 if ( csq_type & CSQ_START_LOST && tref->s[0]!='M' )
3502 csq_type &= ~CSQ_START_LOST;
3503 if ( node.dlen!=0 )
3504 {
3505 if ( node.dlen%3 )
3506 csq_type |= CSQ_FRAMESHIFT_VARIANT;
3507 else if ( node.dlen<0 )
3508 csq_type |= CSQ_INFRAME_DELETION;
3509 else
3510 csq_type |= CSQ_INFRAME_INSERTION;
3511 if ( tref->s[tref->l-1]!='*' && tseq->s[tseq->l-1]=='*' )
3512 csq_type |= CSQ_STOP_GAINED;
3513 }
3514 else
3515 {
3516 for (j=0; j<tref->l; j++)
3517 if ( tref->s[j] != tseq->s[j] ) break;
3518 if ( j==tref->l )
3519 csq_type |= CSQ_SYNONYMOUS_VARIANT;
3520 else if ( tref->s[j] == '*' )
3521 csq_type |= CSQ_STOP_LOST;
3522 else if ( tseq->s[j] == '*' )
3523 csq_type |= CSQ_STOP_GAINED;
3524 else
3525 csq_type |= CSQ_MISSENSE_VARIANT;
3526 }
3527 if ( csq_type & CSQ_COMPOUND )
3528 {
3529 // create the aa variant string
3530 kstring_t str = {0,0,0};
3531 int aa_rbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (tr->nsref - 2*N_REF_PAD - node.sbeg - node.rlen)/3+1;
3532 int aa_sbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (tr->nsref - 2*N_REF_PAD + node.dlen - node.sbeg - alen)/3+1;
3533 kputc_('|', &str);
3534 kputw(aa_rbeg, &str);
3535 kprint_aa_prediction(args,aa_rbeg,tref,&str);
3536 if ( !(csq_type & CSQ_SYNONYMOUS_VARIANT) )
3537 {
3538 kputc_('>', &str);
3539 kputw(aa_sbeg, &str);
3540 kprint_aa_prediction(args,aa_sbeg,tseq,&str);
3541 }
3542 kputc_('|', &str);
3543 kputw(rec->pos+1, &str);
3544 kputs(node.var, &str);
3545 csq.type.vstr = str;
3546 csq.type.type = csq_type & CSQ_COMPOUND;
3547 csq_stage(args, &csq, rec);
3548
3549 // all this only to clean vstr when vrec is flushed
3550 if ( !tr->root )
3551 tr->root = (hap_node_t*) calloc(1,sizeof(hap_node_t));
3552 tr->root->ncsq_list++;
3553 hts_expand0(csq_t,tr->root->ncsq_list,tr->root->mcsq_list,tr->root->csq_list);
3554 csq_t *rm_csq = tr->root->csq_list + tr->root->ncsq_list - 1;
3555 rm_csq->type.vstr = str;
3556 }
3557 if ( csq_type & ~CSQ_COMPOUND )
3558 {
3559 csq.type.type = csq_type & ~CSQ_COMPOUND;
3560 csq.type.vstr.l = 0;
3561 csq_stage(args, &csq, rec);
3562 }
3563 }
3564 free(node.seq);
3565 free(node.var);
3566 }
3567 }
3568 return ret;
3569 }
3570
test_cds(args_t * args,bcf1_t * rec,vbuf_t * vbuf)3571 int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf)
3572 {
3573 static int overlaps_warned = 0, multiploid_warned = 0;
3574
3575 int i, ret = 0, hap_ret;
3576 const char *chr = bcf_seqname(args->hdr,rec);
3577 // note that the off-by-one extension of rlen is deliberate to account for insertions
3578 if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
3579 while ( regitr_overlap(args->itr) )
3580 {
3581 gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*);
3582 tscript_t *tr = cds->tr;
3583 if ( !GF_is_coding(tr->type) ) continue;
3584 if ( vbuf->keep_until < tr->end ) vbuf->keep_until = tr->end;
3585 ret = 1;
3586 if ( !tr->root )
3587 {
3588 // initialize the transcript and its haplotype tree, fetch the reference sequence
3589 tscript_init_ref(args, tr, chr);
3590
3591 tr->root = (hap_node_t*) calloc(1,sizeof(hap_node_t));
3592 tr->nhap = args->phase==PHASE_DROP_GT ? 1 : 2*args->smpl->n; // maximum ploidy = diploid
3593 tr->hap = (hap_node_t**) malloc(tr->nhap*sizeof(hap_node_t*));
3594 for (i=0; i<tr->nhap; i++) tr->hap[i] = NULL;
3595 tr->root->nend = tr->nhap;
3596 tr->root->type = HAP_ROOT;
3597
3598 khp_insert(trhp, args->active_tr, &tr);
3599 }
3600
3601 sanity_check_ref(args, tr, rec);
3602
3603 if ( args->phase==PHASE_DROP_GT )
3604 {
3605 if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; }
3606 hap_node_t *parent = tr->hap[0] ? tr->hap[0] : tr->root;
3607 hap_node_t *child = (hap_node_t*)calloc(1,sizeof(hap_node_t));
3608 hap_ret = hap_init(args, parent, child, cds, rec, 1);
3609 if ( hap_ret!=0 )
3610 {
3611 // overlapping or intron variant, cannot apply
3612 if ( hap_ret==1 )
3613 {
3614 if ( args->verbosity && (!overlaps_warned || args->verbosity > 1) )
3615 {
3616 fprintf(stderr,
3617 "Warning: Skipping overlapping variants at %s:%"PRId64"\t%s>%s.\n",
3618 chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
3619 if ( !overlaps_warned )
3620 fprintf(stderr," This message is printed only once, the verbosity can be increased with `--verbose 2`\n");
3621 overlaps_warned = 1;
3622 }
3623 if ( args->out )
3624 fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%"PRId64"\t%s>%s\n", chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
3625 }
3626 else ret = 1; // prevent reporting as intron in test_tscript
3627 hap_destroy(child);
3628 continue;
3629 }
3630 if ( child->type==HAP_SSS )
3631 {
3632 csq_t csq;
3633 memset(&csq, 0, sizeof(csq_t));
3634 csq.pos = rec->pos;
3635 csq.type.biotype = tr->type;
3636 csq.type.strand = tr->strand;
3637 csq.type.trid = tr->id;
3638 csq.type.vcf_ial = 1;
3639 csq.type.gene = tr->gene->name;
3640 csq.type.type = child->csq;
3641 csq_stage(args, &csq, rec);
3642 hap_destroy(child);
3643 ret = 1;
3644 continue;
3645 }
3646 parent->nend--;
3647 parent->nchild = 1;
3648 parent->mchild = 1;
3649 parent->child = (hap_node_t**) malloc(sizeof(hap_node_t*));
3650 parent->child[0] = child;
3651 tr->hap[0] = child;
3652 tr->hap[0]->nend = 1;
3653 continue;
3654 }
3655
3656 // apply the VCF variants and extend the haplotype tree
3657 int j, ismpl, ihap, ngts = bcf_get_genotypes(args->hdr, rec, &args->gt_arr, &args->mgt_arr);
3658 ngts /= bcf_hdr_nsamples(args->hdr);
3659 if ( ngts!=1 && ngts!=2 )
3660 {
3661 if ( args->verbosity && (!multiploid_warned || args->verbosity > 1) )
3662 {
3663 fprintf(stderr,
3664 "Warning: Skipping site with non-diploid/non-haploid genotypes at %s:%"PRId64"\t%s>%s.\n",
3665 chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
3666 if ( !multiploid_warned )
3667 fprintf(stderr," This message is printed only once, the verbosity can be increased with `--verbose 2`\n");
3668 multiploid_warned = 1;
3669 }
3670 if ( args->out )
3671 fprintf(args->out,"LOG\tWarning: Skipping site with non-diploid/non-haploid genotypes at %s:%"PRId64"\t%s>%s\n", chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
3672 continue;
3673 }
3674 for (ismpl=0; ismpl<args->smpl->n; ismpl++)
3675 {
3676 int32_t *gt = args->gt_arr + args->smpl->idx[ismpl]*ngts;
3677 if ( gt[0]==bcf_gt_missing ) continue;
3678
3679 if ( ngts>1 && gt[1]!=bcf_gt_missing && gt[1]!=bcf_int32_vector_end && bcf_gt_allele(gt[0])!=bcf_gt_allele(gt[1]) )
3680 {
3681 if ( args->phase==PHASE_MERGE )
3682 {
3683 if ( !bcf_gt_allele(gt[0]) ) gt[0] = gt[1];
3684 }
3685 if ( !bcf_gt_is_phased(gt[0]) && !bcf_gt_is_phased(gt[1]) )
3686 {
3687 if ( args->phase==PHASE_REQUIRE )
3688 error("Unphased heterozygous genotype at %s:%"PRId64", sample %s. See the --phase option.\n", chr,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]);
3689 if ( args->phase==PHASE_SKIP )
3690 continue;
3691 if ( args->phase==PHASE_NON_REF )
3692 {
3693 if ( !bcf_gt_allele(gt[0]) ) gt[0] = gt[1];
3694 else if ( !bcf_gt_allele(gt[1]) ) gt[1] = gt[0];
3695 }
3696 }
3697 }
3698
3699 for (ihap=0; ihap<ngts; ihap++)
3700 {
3701 if ( gt[ihap]==bcf_gt_missing || gt[ihap]==bcf_int32_vector_end ) continue;
3702
3703 i = 2*ismpl + ihap;
3704
3705 int ial = bcf_gt_allele(gt[ihap]);
3706 if ( !ial ) continue;
3707 assert( ial < rec->n_allele );
3708 if ( rec->d.allele[ial][0]=='<' || rec->d.allele[ial][0]=='*' ) { continue; }
3709
3710 hap_node_t *parent = tr->hap[i] ? tr->hap[i] : tr->root;
3711 if ( parent->cur_rec==rec && parent->cur_child[ial]>=0 )
3712 {
3713 // this haplotype has been seen in another sample
3714 tr->hap[i] = parent->child[ parent->cur_child[ial] ];
3715 tr->hap[i]->nend++;
3716 parent->nend--;
3717 continue;
3718 }
3719
3720 hap_node_t *child = (hap_node_t*)calloc(1,sizeof(hap_node_t));
3721 hap_ret = hap_init(args, parent, child, cds, rec, ial);
3722 if ( hap_ret!=0 )
3723 {
3724 // overlapping or intron variant, cannot apply
3725 if ( hap_ret==1 )
3726 {
3727 if ( args->verbosity && (!overlaps_warned || args->verbosity > 1) )
3728 {
3729 fprintf(stderr,
3730 "Warning: Skipping overlapping variants at %s:%"PRId64", sample %s\t%s>%s.\n",
3731 chr,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]);
3732 if ( !overlaps_warned )
3733 fprintf(stderr," This message is printed only once, the verbosity can be increased with `--verbose 2`\n");
3734 overlaps_warned = 1;
3735 }
3736 if ( args->out )
3737 fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%"PRId64", sample %s\t%s>%s\n",
3738 chr,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]);
3739 }
3740 hap_destroy(child);
3741 continue;
3742 }
3743 if ( child->type==HAP_SSS )
3744 {
3745 csq_t csq;
3746 memset(&csq, 0, sizeof(csq_t));
3747 csq.pos = rec->pos;
3748 csq.type.biotype = tr->type;
3749 csq.type.strand = tr->strand;
3750 csq.type.trid = tr->id;
3751 csq.type.vcf_ial = ial;
3752 csq.type.gene = tr->gene->name;
3753 csq.type.type = child->csq;
3754 csq_stage(args, &csq, rec);
3755 hap_destroy(child);
3756 continue;
3757 }
3758 if ( parent->cur_rec!=rec )
3759 {
3760 hts_expand(int,rec->n_allele,parent->mcur_child,parent->cur_child);
3761 for (j=0; j<rec->n_allele; j++) parent->cur_child[j] = -1;
3762 parent->cur_rec = rec;
3763 }
3764
3765 j = parent->nchild++;
3766 hts_expand0(hap_node_t*,parent->nchild,parent->mchild,parent->child);
3767 parent->cur_child[ial] = j;
3768 parent->child[j] = child;
3769 tr->hap[i] = child;
3770 tr->hap[i]->nend++;
3771 parent->nend--;
3772 }
3773 }
3774 }
3775 return ret;
3776 }
3777
csq_stage(args_t * args,csq_t * csq,bcf1_t * rec)3778 void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec)
3779 {
3780 // known issues: tab output leads to unsorted output. This is because
3781 // coding haplotypes are printed in one go and buffering is not used
3782 // with tab output. VCF output is OK though.
3783 if ( csq_push(args, csq, rec)!=0 && args->phase==PHASE_DROP_GT ) return; // the consequence already exists
3784
3785 int i,j,ngt = 0;
3786 if ( args->phase!=PHASE_DROP_GT )
3787 {
3788 ngt = bcf_get_genotypes(args->hdr, rec, &args->gt_arr, &args->mgt_arr);
3789 if ( ngt>0 ) ngt /= bcf_hdr_nsamples(args->hdr);
3790 }
3791 if ( ngt<=0 )
3792 {
3793 if ( args->output_type==FT_TAB_TEXT )
3794 csq_print_text(args, csq, -1,0);
3795 return;
3796 }
3797 assert( ngt<=2 );
3798
3799 if ( args->output_type==FT_TAB_TEXT )
3800 {
3801 for (i=0; i<args->smpl->n; i++)
3802 {
3803 int32_t *gt = args->gt_arr + args->smpl->idx[i]*ngt;
3804 for (j=0; j<ngt; j++)
3805 {
3806 if ( gt[j]==bcf_gt_missing || gt[j]==bcf_int32_vector_end ) continue;
3807 int ial = bcf_gt_allele(gt[j]);
3808 if ( !ial || ial!=csq->type.vcf_ial ) continue;
3809 csq_print_text(args, csq, args->smpl->idx[i],j+1);
3810 }
3811 }
3812 return;
3813 }
3814
3815 vrec_t *vrec = csq->vrec;
3816 for (i=0; i<args->smpl->n; i++)
3817 {
3818 int32_t *gt = args->gt_arr + args->smpl->idx[i]*ngt;
3819 for (j=0; j<ngt; j++)
3820 {
3821 if ( gt[j]==bcf_gt_missing || gt[j]==bcf_int32_vector_end ) continue;
3822 int ial = bcf_gt_allele(gt[j]);
3823 if ( !ial || ial!=csq->type.vcf_ial ) continue;
3824
3825 int icsq2 = 2*csq->idx + j;
3826 if ( icsq2 >= args->ncsq2_max ) // more than ncsq_max consequences, so can't fit it in FMT
3827 {
3828 int ismpl = args->smpl->idx[i];
3829 if ( args->verbosity && (!args->ncsq2_small_warned || args->verbosity > 1) )
3830 {
3831 fprintf(stderr,
3832 "Warning: Too many consequences for sample %s at %s:%"PRId64", keeping the first %d and skipping the rest.\n",
3833 args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),(int64_t) vrec->line->pos+1,icsq2+1);
3834 if ( !args->ncsq2_small_warned )
3835 fprintf(stderr," The limit can be increased by setting the --ncsq parameter. This warning is printed only once.\n");
3836 args->ncsq2_small_warned = 1;
3837 }
3838 if ( args->ncsq2_small_warned < icsq2 ) args->ncsq2_small_warned = icsq2;
3839 break;
3840 }
3841 int ival, ibit;
3842 icsq2_to_bit(icsq2, &ival,&ibit);
3843 if ( vrec->nfmt < 1 + ival ) vrec->nfmt = 1 + ival;
3844 vrec->fmt_bm[i*args->nfmt_bcsq + ival] |= 1 << ibit;
3845 }
3846 }
3847 }
test_utr(args_t * args,bcf1_t * rec)3848 int test_utr(args_t *args, bcf1_t *rec)
3849 {
3850 const char *chr = bcf_seqname(args->hdr,rec);
3851 // note that the off-by-one extension of rlen is deliberate to account for insertions
3852 if ( !regidx_overlap(args->idx_utr,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
3853
3854 splice_t splice;
3855 splice_init(&splice, rec);
3856
3857 int i, ret = 0;
3858 while ( regitr_overlap(args->itr) )
3859 {
3860 gf_utr_t *utr = regitr_payload(args->itr, gf_utr_t*);
3861 tscript_t *tr = splice.tr = utr->tr;
3862 for (i=1; i<rec->n_allele; i++)
3863 {
3864 if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; }
3865 splice.vcf.alt = rec->d.allele[i];
3866 splice.csq = 0;
3867 int splice_ret = splice_csq(args, &splice, utr->beg, utr->end);
3868 if ( splice_ret!=SPLICE_INSIDE && splice_ret!=SPLICE_OVERLAP ) continue;
3869 csq_t csq;
3870 memset(&csq, 0, sizeof(csq_t));
3871 csq.pos = rec->pos;
3872 csq.type.type = utr->which==prime5 ? CSQ_UTR5 : CSQ_UTR3;
3873 csq.type.biotype = tr->type;
3874 csq.type.strand = tr->strand;
3875 csq.type.trid = tr->id;
3876 csq.type.vcf_ial = i;
3877 csq.type.gene = tr->gene->name;
3878 csq_stage(args, &csq, rec);
3879 ret = 1;
3880 }
3881 }
3882 assert(!splice.kref.s);
3883 assert(!splice.kalt.s);
3884 return ret;
3885 }
test_splice(args_t * args,bcf1_t * rec)3886 int test_splice(args_t *args, bcf1_t *rec)
3887 {
3888 const char *chr = bcf_seqname(args->hdr,rec);
3889 if ( !regidx_overlap(args->idx_exon,chr,rec->pos,rec->pos + rec->rlen, args->itr) ) return 0;
3890
3891 splice_t splice;
3892 splice_init(&splice, rec);
3893 splice.check_acceptor = splice.check_donor = 1;
3894
3895 int i, ret = 0;
3896 while ( regitr_overlap(args->itr) )
3897 {
3898 gf_exon_t *exon = regitr_payload(args->itr, gf_exon_t*);
3899 splice.tr = exon->tr;
3900 if ( !splice.tr->ncds ) continue; // not a coding transcript, no interest in splice sites
3901
3902 splice.check_region_beg = splice.tr->beg==exon->beg ? 0 : 1;
3903 splice.check_region_end = splice.tr->end==exon->end ? 0 : 1;
3904
3905 for (i=1; i<rec->n_allele; i++)
3906 {
3907 if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; }
3908 splice.vcf.alt = rec->d.allele[i];
3909 splice.csq = 0;
3910 splice_csq(args, &splice, exon->beg, exon->end);
3911 if ( splice.csq ) ret = 1;
3912 }
3913 }
3914 free(splice.kref.s);
3915 free(splice.kalt.s);
3916 return ret;
3917 }
test_tscript(args_t * args,bcf1_t * rec)3918 int test_tscript(args_t *args, bcf1_t *rec)
3919 {
3920 const char *chr = bcf_seqname(args->hdr,rec);
3921 if ( !regidx_overlap(args->idx_tscript,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
3922
3923 splice_t splice;
3924 splice_init(&splice, rec);
3925
3926 int i, ret = 0;
3927 while ( regitr_overlap(args->itr) )
3928 {
3929 tscript_t *tr = splice.tr = regitr_payload(args->itr, tscript_t*);
3930 for (i=1; i<rec->n_allele; i++)
3931 {
3932 if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; }
3933 splice.vcf.alt = rec->d.allele[i];
3934 splice.csq = 0;
3935 int splice_ret = splice_csq(args, &splice, tr->beg, tr->end);
3936 if ( splice_ret!=SPLICE_INSIDE && splice_ret!=SPLICE_OVERLAP ) continue; // SPLICE_OUTSIDE or SPLICE_REF
3937 csq_t csq;
3938 memset(&csq, 0, sizeof(csq_t));
3939 csq.pos = rec->pos;
3940 csq.type.type = GF_is_coding(tr->type) ? CSQ_INTRON : CSQ_NON_CODING;
3941 csq.type.biotype = tr->type;
3942 csq.type.strand = tr->strand;
3943 csq.type.trid = tr->id;
3944 csq.type.gene = tr->gene->name;
3945 csq_stage(args, &csq, rec);
3946 ret = 1;
3947 }
3948 }
3949 assert(!splice.kref.s);
3950 assert(!splice.kalt.s);
3951 return ret;
3952 }
3953
test_symbolic_alt(args_t * args,bcf1_t * rec)3954 void test_symbolic_alt(args_t *args, bcf1_t *rec)
3955 {
3956 static int warned = 0;
3957 if ( args->verbosity && (!warned && args->verbosity > 0) )
3958 {
3959 fprintf(stderr,"Warning: The support for symbolic ALT insertions is experimental.\n");
3960 warned = 1;
3961 }
3962
3963 const char *chr = bcf_seqname(args->hdr,rec);
3964
3965 // only insertions atm
3966 int beg = rec->pos + 1;
3967 int end = beg;
3968 int csq_class = CSQ_ELONGATION;
3969
3970 int hit = 0;
3971 if ( regidx_overlap(args->idx_cds,chr,beg,end, args->itr) )
3972 {
3973 while ( regitr_overlap(args->itr) )
3974 {
3975 csq_t csq;
3976 memset(&csq, 0, sizeof(csq_t));
3977 gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*);
3978 tscript_t *tr = cds->tr;
3979 csq.type.type = (GF_is_coding(tr->type) ? CSQ_CODING_SEQUENCE : CSQ_NON_CODING) | csq_class;
3980 csq.pos = rec->pos;
3981 csq.type.biotype = tr->type;
3982 csq.type.strand = tr->strand;
3983 csq.type.trid = tr->id;
3984 csq.type.gene = tr->gene->name;
3985 csq_stage(args, &csq, rec);
3986 hit = 1;
3987 }
3988 }
3989 if ( regidx_overlap(args->idx_utr,chr,beg,end, args->itr) )
3990 {
3991 while ( regitr_overlap(args->itr) )
3992 {
3993 csq_t csq;
3994 memset(&csq, 0, sizeof(csq_t));
3995 gf_utr_t *utr = regitr_payload(args->itr, gf_utr_t*);
3996 tscript_t *tr = utr->tr;
3997 csq.type.type = (utr->which==prime5 ? CSQ_UTR5 : CSQ_UTR3) | csq_class;
3998 csq.pos = rec->pos;
3999 csq.type.biotype = tr->type;
4000 csq.type.strand = tr->strand;
4001 csq.type.trid = tr->id;
4002 csq.type.gene = tr->gene->name;
4003 csq_stage(args, &csq, rec);
4004 hit = 1;
4005 }
4006 }
4007 if ( regidx_overlap(args->idx_exon,chr,beg,end, args->itr) )
4008 {
4009 splice_t splice;
4010 splice_init(&splice, rec);
4011 splice.check_acceptor = splice.check_donor = 1;
4012
4013 while ( regitr_overlap(args->itr) )
4014 {
4015 gf_exon_t *exon = regitr_payload(args->itr, gf_exon_t*);
4016 splice.tr = exon->tr;
4017 if ( !splice.tr->ncds ) continue; // not a coding transcript, no interest in splice sites
4018 splice.check_region_beg = splice.tr->beg==exon->beg ? 0 : 1;
4019 splice.check_region_end = splice.tr->end==exon->end ? 0 : 1;
4020 splice.vcf.alt = rec->d.allele[1];
4021 splice.csq = csq_class;
4022 splice_csq(args, &splice, exon->beg, exon->end);
4023 if ( splice.csq ) hit = 1;
4024 }
4025 }
4026 if ( !hit && regidx_overlap(args->idx_tscript,chr,beg,end, args->itr) )
4027 {
4028 splice_t splice;
4029 splice_init(&splice, rec);
4030
4031 while ( regitr_overlap(args->itr) )
4032 {
4033 csq_t csq;
4034 memset(&csq, 0, sizeof(csq_t));
4035 tscript_t *tr = splice.tr = regitr_payload(args->itr, tscript_t*);
4036 splice.vcf.alt = rec->d.allele[1];
4037 splice.csq = csq_class;
4038 int splice_ret = splice_csq(args, &splice, tr->beg, tr->end);
4039 if ( splice_ret!=SPLICE_INSIDE && splice_ret!=SPLICE_OVERLAP ) continue; // SPLICE_OUTSIDE or SPLICE_REF
4040 csq.type.type = (GF_is_coding(tr->type) ? CSQ_INTRON : CSQ_NON_CODING) | csq_class;
4041 csq.pos = rec->pos;
4042 csq.type.biotype = tr->type;
4043 csq.type.strand = tr->strand;
4044 csq.type.trid = tr->id;
4045 csq.type.gene = tr->gene->name;
4046 csq_stage(args, &csq, rec);
4047 }
4048 }
4049 }
4050
debug_print_buffers(args_t * args,int pos)4051 void debug_print_buffers(args_t *args, int pos)
4052 {
4053 int i,j;
4054 fprintf(stderr,"debug_print_buffers at %d\n", pos);
4055 fprintf(stderr,"vbufs:\n");
4056 for (i=0; i<args->vcf_rbuf.n; i++)
4057 {
4058 int k = rbuf_kth(&args->vcf_rbuf, i);
4059 vbuf_t *vbuf = args->vcf_buf[k];
4060
4061 fprintf(stderr,"\tvbuf %d:\n", i);
4062 for (j=0; j<vbuf->n; j++)
4063 {
4064 vrec_t *vrec = vbuf->vrec[j];
4065 fprintf(stderr,"\t\t%"PRId64" .. nvcsq=%d\n", (int64_t) vrec->line->pos+1, vrec->nvcsq);
4066 }
4067 }
4068 fprintf(stderr,"pos2vbuf:");
4069 khint_t k;
4070 for (k = 0; k < kh_end(args->pos2vbuf); ++k)
4071 if (kh_exist(args->pos2vbuf, k)) fprintf(stderr," %d",1+(int)kh_key(args->pos2vbuf, k));
4072 fprintf(stderr,"\n");
4073 fprintf(stderr,"active_tr: %d\n", args->active_tr->ndat);
4074 }
4075
process(args_t * args,bcf1_t ** rec_ptr)4076 static void process(args_t *args, bcf1_t **rec_ptr)
4077 {
4078 if ( !rec_ptr )
4079 {
4080 hap_flush(args, REGIDX_MAX);
4081 vbuf_flush(args, REGIDX_MAX);
4082 return;
4083 }
4084
4085 bcf1_t *rec = *rec_ptr;
4086 static int32_t prev_rid = -1, prev_pos = -1;
4087 if ( prev_rid!=rec->rid )
4088 {
4089 prev_rid = rec->rid;
4090 prev_pos = rec->pos;
4091
4092 // Common error is to use different naming conventions in the fasta and the VCF (e.g. X vs chrX).
4093 // Perform a simple sanity check (that does not catch much), the chromosome must be present in the
4094 // reference file
4095 if ( !faidx_has_seq(args->fai,bcf_seqname(args->hdr,rec)) )
4096 error("Error: the chromosome \"%s\" is not present in %s\n",bcf_seqname(args->hdr,rec),args->fa_fname);
4097 }
4098 if ( prev_pos > rec->pos )
4099 error("Error: The file is not sorted, %s:%d comes before %s:%"PRId64"\n",bcf_seqname(args->hdr,rec),prev_pos+1,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
4100
4101 int call_csq = 1;
4102 if ( rec->n_allele < 2 ) call_csq = 0; // no alternate allele
4103 else if ( rec->n_allele==2 && (rec->d.allele[1][0]=='*' || rec->d.allele[1][1]=='*') ) call_csq = 0; // gVCF, not an alt allele
4104 else if ( rec->d.allele[1][0]=='<' )
4105 {
4106 if ( strncmp("<INS",rec->d.allele[1], 4) ) call_csq = 0; // only <INS[:.*]> is supported at the moment
4107 }
4108 if ( call_csq && args->filter )
4109 {
4110 call_csq = filter_test(args->filter, rec, NULL);
4111 if ( args->filter_logic==FLT_EXCLUDE ) call_csq = call_csq ? 0 : 1;
4112 }
4113 if ( !call_csq )
4114 {
4115 if ( !args->out_fh ) return; // not a VCF output
4116 vbuf_push(args, rec_ptr);
4117 hap_flush(args, rec->pos-1);
4118 vbuf_flush(args, rec->pos-1);
4119 return;
4120 }
4121
4122 if ( args->rid != rec->rid )
4123 {
4124 hap_flush(args, REGIDX_MAX);
4125 vbuf_flush(args, REGIDX_MAX);
4126 }
4127 args->rid = rec->rid;
4128 vbuf_t *vbuf = vbuf_push(args, rec_ptr);
4129
4130 if ( rec->d.allele[1][0]!='<' )
4131 {
4132 int hit = args->local_csq ? test_cds_local(args, rec) : test_cds(args, rec, vbuf);
4133 hit += test_utr(args, rec);
4134 hit += test_splice(args, rec);
4135 if ( !hit ) test_tscript(args, rec);
4136 }
4137 else
4138 test_symbolic_alt(args, rec);
4139
4140 if ( rec->pos > 0 )
4141 {
4142 hap_flush(args, rec->pos-1);
4143 vbuf_flush(args, rec->pos-1);
4144 }
4145
4146 return;
4147 }
4148
usage(void)4149 static const char *usage(void)
4150 {
4151 return
4152 "\n"
4153 "About: Haplotype-aware consequence caller.\n"
4154 "Usage: bcftools csq [OPTIONS] in.vcf\n"
4155 "\n"
4156 "Required options:\n"
4157 " -f, --fasta-ref FILE Reference file in fasta format\n"
4158 " -g, --gff-annot FILE GFF3 annotation file\n"
4159 "\n"
4160 "CSQ options:\n"
4161 " -B, --trim-protein-seq INT Abbreviate protein-changing predictions to max INT aminoacids\n"
4162 " -c, --custom-tag STRING Use this tag instead of the default BCSQ\n"
4163 " -l, --local-csq Localized predictions, consider only one VCF record at a time\n"
4164 " -n, --ncsq INT Maximum number of per-haplotype consequences to consider for each site [15]\n"
4165 " -p, --phase a|m|r|R|s How to handle unphased heterozygous genotypes: [r]\n"
4166 " a: take GTs as is, create haplotypes regardless of phase (0/1 -> 0|1)\n"
4167 " m: merge *all* GTs into a single haplotype (0/1 -> 1, 1/2 -> 1)\n"
4168 " r: require phased GTs, throw an error on unphased het GTs\n"
4169 " R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n"
4170 " s: skip unphased hets\n"
4171 "Options:\n"
4172 " -e, --exclude EXPR Exclude sites for which the expression is true\n"
4173 " --force Run even if some sanity checks fail\n"
4174 " -i, --include EXPR Select sites for which the expression is true\n"
4175 " --no-version Do not append version and command line to the header\n"
4176 " -o, --output FILE Write output to a file [standard output]\n"
4177 " -O, --output-type b|u|z|v|t[0-9] b: compressed BCF, u: uncompressed BCF, z: compressed VCF\n"
4178 " v: uncompressed VCF, t: plain tab-delimited text output, 0-9: compression level [v]\n"
4179 " -r, --regions REGION Restrict to comma-separated list of regions\n"
4180 " -R, --regions-file FILE Restrict to regions listed in a file\n"
4181 " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"
4182 " -s, --samples -|LIST Samples to include or \"-\" to apply all variants and ignore samples\n"
4183 " -S, --samples-file FILE Samples to include\n"
4184 " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"
4185 " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"
4186 " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"
4187 " --threads INT Use multithreading with <int> worker threads [0]\n"
4188 " -v, --verbose INT Verbosity level 0-2 [1]\n"
4189 "\n"
4190 "Example:\n"
4191 " bcftools csq -f hs37d5.fa -g Homo_sapiens.GRCh37.82.gff3.gz in.vcf\n"
4192 "\n"
4193 " # GFF3 annotation files can be downloaded from Ensembl. e.g. for human:\n"
4194 " ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens/\n"
4195 " ftp://ftp.ensembl.org/pub/grch37/release-84/gff3/homo_sapiens/\n"
4196 "\n";
4197 }
4198
main_csq(int argc,char * argv[])4199 int main_csq(int argc, char *argv[])
4200 {
4201 args_t *args = (args_t*) calloc(1,sizeof(args_t));
4202 args->argc = argc; args->argv = argv;
4203 args->output_type = FT_VCF;
4204 args->bcsq_tag = "BCSQ";
4205 args->ncsq2_max = 2*(16-1); // 1 bit is reserved for BCF missing values
4206 args->verbosity = 1;
4207 args->record_cmd_line = 1;
4208 args->clevel = -1;
4209
4210 static struct option loptions[] =
4211 {
4212 {"force",0,0,1},
4213 {"threads",required_argument,NULL,2},
4214 {"help",0,0,'h'},
4215 {"ncsq",1,0,'n'},
4216 {"brief-predictions",no_argument,0,'b'},
4217 {"trim-protein-seq",required_argument,0,'B'},
4218 {"custom-tag",1,0,'c'},
4219 {"local-csq",0,0,'l'},
4220 {"gff-annot",1,0,'g'},
4221 {"fasta-ref",1,0,'f'},
4222 {"include",1,0,'i'},
4223 {"exclude",1,0,'e'},
4224 {"output",1,0,'o'},
4225 {"output-type",1,NULL,'O'},
4226 {"phase",1,0,'p'},
4227 {"quiet",0,0,'q'},
4228 {"verbose",1,0,'v'},
4229 {"regions",1,0,'r'},
4230 {"regions-file",1,0,'R'},
4231 {"regions-overlap",required_argument,NULL,4},
4232 {"samples",1,0,'s'},
4233 {"samples-file",1,0,'S'},
4234 {"targets",1,0,'t'},
4235 {"targets-file",1,0,'T'},
4236 {"targets-overlap",required_argument,NULL,5},
4237 {"no-version",no_argument,NULL,3},
4238 {0,0,0,0}
4239 };
4240 int c, targets_is_file = 0, regions_is_file = 0;
4241 int regions_overlap = 1;
4242 int targets_overlap = 0;
4243 char *targets_list = NULL, *regions_list = NULL, *tmp;
4244 while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:bB:v:",loptions,NULL)) >= 0)
4245 {
4246 switch (c)
4247 {
4248 case 1 : args->force = 1; break;
4249 case 2 :
4250 args->n_threads = strtol(optarg,&tmp,10);
4251 if ( *tmp ) error("Could not parse argument: --threads %s\n", optarg);
4252 break;
4253 case 3 : args->record_cmd_line = 0; break;
4254 case 'b':
4255 args->brief_predictions = 1;
4256 fprintf(stderr,"Warning: the -b option will be removed in future versions. Please use -B 1 instead.\n");
4257 break;
4258 case 'B':
4259 args->brief_predictions = strtol(optarg,&tmp,10);
4260 if ( *tmp || args->brief_predictions<1 ) error("Could not parse argument: --trim-protein-seq %s\n", optarg);
4261 break;
4262 case 'l': args->local_csq = 1; break;
4263 case 'c': args->bcsq_tag = optarg; break;
4264 case 'q': error("Error: the -q option has been deprecated, use -v, --verbose instead.\n"); break;
4265 case 'v':
4266 args->verbosity = atoi(optarg);
4267 if ( args->verbosity<0 || args->verbosity>2 ) error("Error: expected integer 0-2 with -v, --verbose\n");
4268 break;
4269 case 'p':
4270 switch (optarg[0])
4271 {
4272 case 'a': args->phase = PHASE_AS_IS; break;
4273 case 'm': args->phase = PHASE_MERGE; break;
4274 case 'r': args->phase = PHASE_REQUIRE; break;
4275 case 'R': args->phase = PHASE_NON_REF; break;
4276 case 's': args->phase = PHASE_SKIP; break;
4277 default: error("The -p code \"%s\" not recognised\n", optarg);
4278 }
4279 break;
4280 case 'f': args->fa_fname = optarg; break;
4281 case 'g': args->gff_fname = optarg; break;
4282 case 'n':
4283 args->ncsq2_max = 2 * atoi(optarg);
4284 if ( args->ncsq2_max <= 0 ) error("Expected positive integer with -n, got %s\n", optarg);
4285 break;
4286 case 'o': args->output_fname = optarg; break;
4287 case 'O':
4288 switch (optarg[0]) {
4289 case 't': args->output_type = FT_TAB_TEXT; break;
4290 case 'b': args->output_type = FT_BCF_GZ; break;
4291 case 'u': args->output_type = FT_BCF; break;
4292 case 'z': args->output_type = FT_VCF_GZ; break;
4293 case 'v': args->output_type = FT_VCF; break;
4294 default:
4295 {
4296 args->clevel = strtol(optarg,&tmp,10);
4297 if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
4298 }
4299 }
4300 if ( optarg[1] )
4301 {
4302 args->clevel = strtol(optarg+1,&tmp,10);
4303 if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --output-type %s\n", optarg+1);
4304 }
4305 break;
4306 case 'e':
4307 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
4308 args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
4309 case 'i':
4310 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
4311 args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
4312 case 'r': regions_list = optarg; break;
4313 case 'R': regions_list = optarg; regions_is_file = 1; break;
4314 case 's': args->sample_list = optarg; break;
4315 case 'S': args->sample_list = optarg; args->sample_is_file = 1; break;
4316 case 't': targets_list = optarg; break;
4317 case 'T': targets_list = optarg; targets_is_file = 1; break;
4318 case 4 :
4319 if ( !strcasecmp(optarg,"0") ) regions_overlap = 0;
4320 else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1;
4321 else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2;
4322 else error("Could not parse: --regions-overlap %s\n",optarg);
4323 break;
4324 case 5 :
4325 if ( !strcasecmp(optarg,"0") ) targets_overlap = 0;
4326 else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1;
4327 else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2;
4328 else error("Could not parse: --targets-overlap %s\n",optarg);
4329 break;
4330 case 'h':
4331 case '?': error("%s",usage());
4332 default: error("The option not recognised: %s\n\n", optarg); break;
4333 }
4334 }
4335 char *fname = NULL;
4336 if ( optind==argc )
4337 {
4338 if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin
4339 else error("%s", usage());
4340 }
4341 else fname = argv[optind];
4342 if ( argc - optind>1 ) error("%s", usage());
4343 if ( !args->fa_fname ) error("Missing the --fa-ref option\n");
4344 if ( !args->gff_fname ) error("Missing the --gff option\n");
4345 args->sr = bcf_sr_init();
4346 if ( targets_list )
4347 {
4348 bcf_sr_set_opt(args->sr,BCF_SR_TARGETS_OVERLAP,targets_overlap);
4349 if ( bcf_sr_set_targets(args->sr, targets_list, targets_is_file, 0)<0 )
4350 error("Failed to read the targets: %s\n", targets_list);
4351 }
4352 if ( regions_list )
4353 {
4354 bcf_sr_set_opt(args->sr,BCF_SR_REGIONS_OVERLAP,regions_overlap);
4355 if ( bcf_sr_set_regions(args->sr, regions_list, regions_is_file)<0 )
4356 error("Failed to read the regions: %s\n", regions_list);
4357 }
4358 if ( bcf_sr_set_threads(args->sr, args->n_threads)<0 ) error("Failed to create %d extra threads\n", args->n_threads);
4359 if ( !bcf_sr_add_reader(args->sr, fname) )
4360 error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->sr->errnum));
4361 args->hdr = bcf_sr_get_header(args->sr,0);
4362
4363 init_data(args);
4364 while ( bcf_sr_next_line(args->sr) )
4365 {
4366 process(args, &args->sr->readers[0].buffer[0]);
4367 }
4368 process(args,NULL);
4369
4370 destroy_data(args);
4371 bcf_sr_destroy(args->sr);
4372 free(args);
4373 return 0;
4374 }
4375
4376