1 /* The MIT License
2 
3    Copyright (c) 2016-2021 Genome Research Ltd.
4 
5    Author: Petr Danecek <pd3@sanger.ac.uk>
6 
7    Permission is hereby granted, free of charge, to any person obtaining a copy
8    of this software and associated documentation files (the "Software"), to deal
9    in the Software without restriction, including without limitation the rights
10    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11    copies of the Software, and to permit persons to whom the Software is
12    furnished to do so, subject to the following conditions:
13 
14    The above copyright notice and this permission notice shall be included in
15    all copies or substantial portions of the Software.
16 
17    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23    THE SOFTWARE.
24 
25  */
26 /*
27     Things that would be nice to have
28         - dynamic N_REF_PAD
29         - for stop-lost events (also in frameshifts) report the number of truncated aa's
30         - memory could be greatly reduced by indexing gff (but it is quite compact already)
31         - deletions that go beyond transcript boundaries are not checked at sequence level
32             - alloc tscript->ref in hap_finalize, introduce fa_off_beg:16,fa_off_end:16
33             - see test/csq/ENST00000573314/insertion-overlap.vcf #1476288882
34 
35     Read about transcript types here
36         http://vega.sanger.ac.uk/info/about/gene_and_transcript_types.html
37         http://www.ensembl.org/info/genome/variation/predicted_data.html
38         http://www.gencodegenes.org/gencode_biotypes.html
39 
40     List of supported biotypes
41         antisense
42         IG_C_gene
43         IG_D_gene
44         IG_J_gene
45         IG_LV_gene
46         IG_V_gene
47         lincRNA
48         macro_lncRNA
49         miRNA
50         misc_RNA
51         Mt_rRNA
52         Mt_tRNA
53         polymorphic_pseudogene
54         processed_transcript
55         protein_coding
56         ribozyme
57         rRNA
58         sRNA
59         scRNA
60         scaRNA
61         sense_intronic
62         sense_overlapping
63         snRNA
64         snoRNA
65         TR_C_gene
66         TR_D_gene
67         TR_J_gene
68         TR_V_gene
69 
70     The gff parsing logic
71         We collect features such by combining gff lines A,B,C as follows:
72             A .. gene line with a supported biotype
73                     A.ID=~/^gene:/
74 
75             B .. transcript line referencing A with supported biotype
76                     B.ID=~/^transcript:/ && B.Parent=~/^gene:A.ID/
77 
78             C .. corresponding CDS, exon, and UTR lines:
79                     C[3] in {"CDS","exon","three_prime_UTR","five_prime_UTR"} && C.Parent=~/^transcript:B.ID/
80 
81         For coding biotypes ("protein_coding" or "polymorphic_pseudogene") the
82         complete chain link C -> B -> A is required. For the rest, link B -> A suffices.
83 
84 
85     The supported consequence types, sorted by impact:
86         splice_acceptor_variant .. end region of an intron changed (2bp at the 3' end of an intron)
87         splice_donor_variant    .. start region of an intron changed (2bp at the 5' end of an intron)
88         stop_gained             .. DNA sequence variant resulting in a stop codon
89         frameshift_variant      .. number of inserted/deleted bases not a multiple of three, disrupted translational frame
90         stop_lost               .. elongated transcript, stop codon changed
91         start_lost              .. the first codon changed
92         inframe_altering        .. combination of indels leading to unchanged reading frame and length
93         inframe_insertion       .. inserted coding sequence, unchanged reading frame
94         inframe_deletion        .. deleted coding sequence, unchanged reading frame
95         missense_variant        .. amino acid (aa) change, unchanged length
96         splice_region_variant   .. change within 1-3 bases of the exon or 3-8 bases of the intron
97         synonymous_variant      .. DNA sequence variant resulting in no amino acid change
98         stop_retained_variant   .. different stop codon
99         start_retained_variant  .. start codon retained by indel realignment
100         non_coding_variant      .. variant in non-coding sequence, such as RNA gene
101         5_prime_UTR_variant
102         3_prime_UTR_variant
103         intron_variant          .. reported only if none of the above
104         intergenic_variant      .. reported only if none of the above
105 
106 
107     The annotation algorithm.
108         The algorithm checks if the variant falls in a region of a supported type. The
109         search is performed in the following order, until a match is found:
110             1. idx_cds(gf_cds_t) - lookup CDS by position, create haplotypes, call consequences
111             2. idx_utr(gf_utr_t) - check UTR hits
112             3. idx_exon(gf_exon_t) - check for splice variants
113             4. idx_tscript(tscript_t) - check for intronic variants, RNAs, etc.
114 
115         These regidx indexes are created by parsing a gff3 file as follows:
116             1.  create the array "ftr" of all UTR, CDS, exons. This will be
117             processed later and pruned based on transcript types we want to keep.
118             In the same go, create the hash "id2tr" of transcripts to keep
119             (based on biotype) which maps from transcript_id to a transcript. At
120             the same time also build the hash "gid2gene" which maps from gene_id to
121             gf_gene_t pointer.
122 
123             2.  build "idx_cds", "idx_tscript", "idx_utr" and "idx_exon" indexes.
124             Use only features from "ftr" which are present in "id2tr".
125 
126             3.  clean data that won't be needed anymore: ftr, id2tr, gid2gene.
127 
128     Data structures.
129         idx_cds, idx_utr, idx_exon, idx_tscript:
130             as described above, regidx structures for fast lookup of exons/transcripts
131             overlapping a region, the payload is a pointer to tscript.cds
132 */
133 
134 #include <stdio.h>
135 #include <stdlib.h>
136 #include <assert.h>
137 #include <getopt.h>
138 #include <math.h>
139 #include <inttypes.h>
140 #include <htslib/hts.h>
141 #include <htslib/vcf.h>
142 #include <htslib/synced_bcf_reader.h>
143 #include <htslib/khash.h>
144 #include <htslib/khash_str2int.h>
145 #include <htslib/kseq.h>
146 #include <htslib/faidx.h>
147 #include <errno.h>
148 #include <unistd.h>
149 #include <ctype.h>
150 #include "bcftools.h"
151 #include "filter.h"
152 #include "regidx.h"
153 #include "kheap.h"
154 #include "smpl_ilist.h"
155 #include "rbuf.h"
156 
157 #ifndef __FUNCTION__
158 #  define __FUNCTION__ __func__
159 #endif
160 
161 // Logic of the filters: include or exclude sites which match the filters?
162 #define FLT_INCLUDE 1
163 #define FLT_EXCLUDE 2
164 
165 // Definition of splice_region, splice_acceptor and splice_donor
166 #define N_SPLICE_DONOR         2
167 #define N_SPLICE_REGION_EXON   3
168 #define N_SPLICE_REGION_INTRON 8
169 
170 #define N_REF_PAD 10    // number of bases to avoid boundary effects
171 
172 #define STRAND_REV 0
173 #define STRAND_FWD 1
174 
175 #define TRIM_NONE   0
176 #define TRIM_5PRIME 1
177 #define TRIM_3PRIME 2
178 
179 // How to treat phased/unphased genotypes
180 #define PHASE_REQUIRE 0     // --phase r
181 #define PHASE_MERGE   1     // --phase m
182 #define PHASE_AS_IS   2     // --phase a
183 #define PHASE_SKIP    3     // --phase s
184 #define PHASE_NON_REF 4     // --phase R
185 #define PHASE_DROP_GT 5     // --samples -
186 
187 // Node types in the haplotype tree
188 #define HAP_CDS   0
189 #define HAP_ROOT  1
190 #define HAP_SSS   2     // start/stop/splice
191 
192 #define CSQ_PRINTED_UPSTREAM    (1<<0)
193 #define CSQ_SYNONYMOUS_VARIANT  (1<<1)
194 #define CSQ_MISSENSE_VARIANT    (1<<2)
195 #define CSQ_STOP_LOST           (1<<3)
196 #define CSQ_STOP_GAINED         (1<<4)
197 #define CSQ_INFRAME_DELETION    (1<<5)
198 #define CSQ_INFRAME_INSERTION   (1<<6)
199 #define CSQ_FRAMESHIFT_VARIANT  (1<<7)
200 #define CSQ_SPLICE_ACCEPTOR     (1<<8)
201 #define CSQ_SPLICE_DONOR        (1<<9)
202 #define CSQ_START_LOST          (1<<10)
203 #define CSQ_SPLICE_REGION       (1<<11)
204 #define CSQ_STOP_RETAINED       (1<<12)
205 #define CSQ_UTR5                (1<<13)
206 #define CSQ_UTR3                (1<<14)
207 #define CSQ_NON_CODING          (1<<15)
208 #define CSQ_INTRON              (1<<16)
209 //#define CSQ_INTERGENIC          (1<<17)
210 #define CSQ_INFRAME_ALTERING    (1<<18)
211 #define CSQ_UPSTREAM_STOP       (1<<19)     // adds * in front of the csq string
212 #define CSQ_INCOMPLETE_CDS      (1<<20)     // to remove START/STOP in incomplete CDS, see ENSG00000173376/synon.vcf
213 #define CSQ_CODING_SEQUENCE     (1<<21)     // cannot tell exactly what it is, but it does affect the coding sequence
214 #define CSQ_ELONGATION          (1<<22)     // symbolic insertion
215 #define CSQ_START_RETAINED      (1<<23)
216 
217 // Haplotype-aware consequences, printed in one vcf record only, the rest has a reference @12345
218 #define CSQ_COMPOUND (CSQ_SYNONYMOUS_VARIANT|CSQ_MISSENSE_VARIANT|CSQ_STOP_LOST|CSQ_STOP_GAINED| \
219                       CSQ_INFRAME_DELETION|CSQ_INFRAME_INSERTION|CSQ_FRAMESHIFT_VARIANT| \
220                       CSQ_START_LOST|CSQ_STOP_RETAINED|CSQ_INFRAME_ALTERING|CSQ_INCOMPLETE_CDS| \
221                       CSQ_UPSTREAM_STOP|CSQ_START_RETAINED)
222 #define CSQ_START_STOP          (CSQ_STOP_LOST|CSQ_STOP_GAINED|CSQ_STOP_RETAINED|CSQ_START_LOST|CSQ_START_RETAINED)
223 
224 #define CSQ_PRN_STRAND(csq)     ((csq)&CSQ_COMPOUND && !((csq)&(CSQ_SPLICE_ACCEPTOR|CSQ_SPLICE_DONOR|CSQ_SPLICE_REGION)))
225 #define CSQ_PRN_TSCRIPT         (~(CSQ_INTRON|CSQ_NON_CODING))
226 #define CSQ_PRN_BIOTYPE         CSQ_NON_CODING
227 
228 // see kput_vcsq()
229 const char *csq_strings[] =
230 {
231     NULL,
232     "synonymous",
233     "missense",
234     "stop_lost",
235     "stop_gained",
236     "inframe_deletion",
237     "inframe_insertion",
238     "frameshift",
239     "splice_acceptor",
240     "splice_donor",
241     "start_lost",
242     "splice_region",
243     "stop_retained",
244     "5_prime_utr",
245     "3_prime_utr",
246     "non_coding",
247     "intron",
248     "intergenic",
249     "inframe_altering",
250     NULL,
251     NULL,
252     "coding_sequence",
253     "feature_elongation",
254     "start_retained"
255 };
256 
257 
258 // GFF line types
259 #define GFF_TSCRIPT_LINE 1
260 #define GFF_GENE_LINE    2
261 
262 
263 /*
264     Genomic features, for fast lookup by position to overlapping features
265 */
266 #define GF_coding_bit 6
267 #define GF_is_coding(x) ((x) & (1<<GF_coding_bit))
268 #define GF_MT_rRNA                       1                      // non-coding: 1, 2, ...
269 #define GF_MT_tRNA                       2
270 #define GF_lincRNA                       3
271 #define GF_miRNA                         4
272 #define GF_MISC_RNA                      5
273 #define GF_rRNA                          6
274 #define GF_snRNA                         7
275 #define GF_snoRNA                        8
276 #define GF_PROCESSED_TRANSCRIPT          9
277 #define GF_ANTISENSE                    10
278 #define GF_macro_lncRNA                 11
279 #define GF_ribozyme                     12
280 #define GF_sRNA                         13
281 #define GF_scRNA                        14
282 #define GF_scaRNA                       15
283 #define GF_SENSE_INTRONIC               16
284 #define GF_SENSE_OVERLAPPING            17
285 #define GF_PSEUDOGENE                   18
286 #define GF_PROCESSED_PSEUDOGENE         19
287 #define GF_ARTIFACT                     20
288 #define GF_IG_PSEUDOGENE                21
289 #define GF_IG_C_PSEUDOGENE              22
290 #define GF_IG_J_PSEUDOGENE              23
291 #define GF_IG_V_PSEUDOGENE              24
292 #define GF_TR_V_PSEUDOGENE              25
293 #define GF_TR_J_PSEUDOGENE              26
294 #define GF_MT_tRNA_PSEUDOGENE           27
295 #define GF_misc_RNA_PSEUDOGENE          28
296 #define GF_miRNA_PSEUDOGENE             29
297 #define GF_RIBOZYME                     30
298 #define GF_RETAINED_INTRON              31
299 #define GF_RETROTRANSPOSED              32
300 #define GF_tRNA_PSEUDOGENE              33
301 #define GF_TRANSCRIBED_PROCESSED_PSEUDOGENE     34
302 #define GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE   35
303 #define GF_TRANSCRIBED_UNITARY_PSEUDOGENE       36
304 #define GF_TRANSLATED_UNPROCESSED_PSEUDOGENE    37
305 #define GF_TRANSLATED_PROCESSED_PSEUDOGENE      38
306 #define GF_KNOWN_NCRNA                          39
307 #define GF_UNITARY_PSEUDOGENE                   40
308 #define GF_UNPROCESSED_PSEUDOGENE               41
309 #define GF_LRG_GENE                             42
310 #define GF_3PRIME_OVERLAPPING_ncRNA             43
311 #define GF_DISRUPTED_DOMAIN                     44
312 #define GF_vaultRNA                             45
313 #define GF_BIDIRECTIONAL_PROMOTER_lncRNA        46
314 #define GF_AMBIGUOUS_ORF                        47
315 #define GF_PROTEIN_CODING               (1|(1<<GF_coding_bit))  // coding: 65, 66, ...
316 #define GF_POLYMORPHIC_PSEUDOGENE       (2|(1<<GF_coding_bit))
317 #define GF_IG_C                         (3|(1<<GF_coding_bit))
318 #define GF_IG_D                         (4|(1<<GF_coding_bit))
319 #define GF_IG_J                         (5|(1<<GF_coding_bit))
320 #define GF_IG_LV                        (6|(1<<GF_coding_bit))
321 #define GF_IG_V                         (7|(1<<GF_coding_bit))
322 #define GF_TR_C                         (8|(1<<GF_coding_bit))
323 #define GF_TR_D                         (9|(1<<GF_coding_bit))
324 #define GF_TR_J                        (10|(1<<GF_coding_bit))
325 #define GF_TR_V                        (11|(1<<GF_coding_bit))
326 #define GF_NMD                         (12|(1<<GF_coding_bit))
327 #define GF_NON_STOP_DECAY              (13|(1<<GF_coding_bit))
328 #define GF_CDS      ((1<<(GF_coding_bit+1))+1)                  // special types: 129, 130, ...
329 #define GF_EXON     ((1<<(GF_coding_bit+1))+2)
330 #define GF_UTR3     ((1<<(GF_coding_bit+1))+3)
331 #define GF_UTR5     ((1<<(GF_coding_bit+1))+4)
332 // GF_MAX = (1<<30)-1, see hap_node_t
333 
334 typedef struct _tscript_t tscript_t;
335 typedef struct
336 {
337     tscript_t *tr;      // transcript
338     uint32_t beg;       // the start coordinate of the CDS (on the reference strand, 0-based)
339     uint32_t pos;       // 0-based index of the first exon base within the transcript (only to
340                         //  update hap_node_t.sbeg in hap_init, could be calculated on the fly)
341     uint32_t len;       // exon length
342     uint32_t icds:30,   // exon index within the transcript
343              phase:2;   // offset of the CDS
344 }
345 gf_cds_t;
346 typedef struct
347 {
348     char *name;           // human readable name, e.g. ORF45
349     uint32_t iseq;
350 }
351 gf_gene_t;
352 typedef struct
353 {
354     uint32_t beg,end;
355     tscript_t *tr;
356 }
357 gf_exon_t;
358 typedef enum { prime3, prime5 } utr_t;
359 typedef struct
360 {
361     utr_t which;
362     uint32_t beg,end;
363     tscript_t *tr;
364 }
365 gf_utr_t;
366 
367 
368 /*
369     Structures related to VCF output:
370 
371     vcsq_t
372         information required to assemble consequence lines such as "inframe_deletion|XYZ|ENST01|+|5TY>5I|121ACG>A+124TA>T"
373 
374     vrec_t
375         single VCF record and csq tied to this record. (Haplotype can have multiple
376         consequences in several VCF records. Each record can have multiple consequences
377         from multiple haplotypes.)
378 
379     csq_t
380         a top-level consequence tied to a haplotype
381 
382     vbuf_t
383     pos2vbuf
384         VCF records with the same position clustered together for a fast lookup via pos2vbuf
385 */
386 typedef struct _vbuf_t vbuf_t;
387 typedef struct _vcsq_t vcsq_t;
388 struct _vcsq_t
389 {
390     uint32_t strand:1,
391              type:31;   // one of CSQ_* types
392     uint32_t trid;
393     uint32_t vcf_ial;
394     uint32_t biotype;   // one of GF_* types
395     char *gene;         // gene name
396     bcf1_t *ref;        // if type&CSQ_PRINTED_UPSTREAM, ref consequence "@1234"
397     kstring_t vstr;     // variant string, eg 5TY>5I|121ACG>A+124TA>T
398 };
399 typedef struct
400 {
401     bcf1_t *line;
402     uint32_t *fmt_bm;   // bitmask of sample consequences with first/second haplotype interleaved
403     uint32_t nfmt:4,    // the bitmask size (the number of integers per sample)
404              nvcsq:28, mvcsq;
405     vcsq_t *vcsq;       // there can be multiple consequences for a single VCF record
406 }
407 vrec_t;
408 typedef struct
409 {
410     uint32_t pos;
411     vrec_t *vrec;   // vcf line that this csq is tied to; needed when printing haplotypes (hap_stage_vcf)
412     int idx;        // 0-based index of the csq at the VCF line, for FMT/BCSQ
413     vcsq_t type;
414 }
415 csq_t;
416 struct _vbuf_t
417 {
418     vrec_t **vrec;   // buffer of VCF lines with the same position
419     int n, m;
420     uint32_t keep_until;    // the maximum transcript end position
421 };
422 KHASH_MAP_INIT_INT(pos2vbuf, vbuf_t*)
423 
424 
425 /*
426     Structures related to haplotype-aware consequences in coding regions
427 
428     hap_node_t
429         node of a haplotype tree. Each transcript has one tree
430 
431     tscript_t
432         despite its general name, it is intended for coding transcripts only
433 
434     hap_t
435     hstack_t
436         for traversal of the haplotype tree and braking combined
437         consequences into independent parts
438 */
439 typedef struct _hap_node_t hap_node_t;
440 struct _hap_node_t
441 {
442     char *seq;          // cds segment [parent_node,this_node)
443     char *var;          // variant "ref>alt"
444     uint32_t type:2,    // HAP_ROOT or HAP_CDS
445              csq:30;    // this node's consequence
446     int dlen;           // alt minus ref length: <0 del, >0 ins, 0 substitution
447     uint32_t rbeg;      // variant's VCF position (0-based, inclusive)
448     int32_t rlen;       // variant's rlen; alen=rlen+dlen; fake for non CDS types
449     uint32_t sbeg;      // variant's position on the spliced reference transcript (0-based, inclusive, N_REF_PAD not included)
450     uint32_t icds;      // which exon does this node's variant overlaps
451     hap_node_t **child, *prev;  // children haplotypes and previous coding node
452     int nchild, mchild;
453     bcf1_t *cur_rec, *rec;      // current VCF record and node's VCF record
454     int vcf_ial;                // which VCF allele generated this node
455     uint32_t nend;              // number of haplotypes ending in this node
456     int *cur_child, mcur_child; // mapping from the allele to the currently active child
457     csq_t *csq_list;            // list of haplotype's consequences, broken by position (each corresponds to a VCF record)
458     int ncsq_list, mcsq_list;
459 };
460 struct _tscript_t
461 {
462     uint32_t id;        // transcript id
463     uint32_t beg,end;   // transcript's beg and end coordinate (ref strand, 0-based, inclusive)
464     uint32_t strand:1,  // STRAND_REV or STRAND_FWD
465              ncds:31,   // number of exons
466              mcds;
467     gf_cds_t **cds;     // ordered list of exons
468     char *ref;          // reference sequence, padded with N_REF_PAD bases on both ends
469     char *sref;         // spliced reference sequence, padded with N_REF_PAD bases on both ends
470     hap_node_t *root;   // root of the haplotype tree
471     hap_node_t **hap;   // pointer to haplotype leaves, two for each sample
472     int nhap, nsref;    // number of haplotypes and length of sref, including 2*N_REF_PAD
473     uint32_t trim:2,    // complete, 5' or 3' trimmed, see TRIM_* types
474              type:30;   // one of GF_* types
475     gf_gene_t *gene;
476 };
cmp_tscript(tscript_t ** a,tscript_t ** b)477 static inline int cmp_tscript(tscript_t **a, tscript_t **b)
478 {
479     return ( (*a)->end  < (*b)->end ) ? 1 : 0;
480 }
481 KHEAP_INIT(trhp, tscript_t*, cmp_tscript)
482 typedef khp_trhp_t tr_heap_t;
483 typedef struct
484 {
485     hap_node_t *node;   // current node
486     int ichild;         // current child in the active node
487     int dlen;           // total dlen, from the root to the active node
488     size_t slen;        // total sequence length, from the root to the active node
489 }
490 hstack_t;
491 typedef struct
492 {
493     int mstack;
494     hstack_t *stack;
495     tscript_t *tr;      // tr->ref: spliced transcript on ref strand
496     kstring_t sseq;     // spliced haplotype sequence on ref strand
497     kstring_t tseq;     // the variable part of translated haplotype transcript, coding strand
498     kstring_t tref;     // the variable part of translated reference transcript, coding strand
499     uint32_t sbeg;      // stack's sbeg, for cases first node's type is HAP_SSS
500     int upstream_stop;
501 }
502 hap_t;
503 
504 
505 /*
506     Helper structures, only for initialization
507 
508     ftr_t
509         temporary list of all exons, CDS, UTRs
510 */
511 KHASH_MAP_INIT_INT(int2tscript, tscript_t*)
512 KHASH_MAP_INIT_INT(int2gene, gf_gene_t*)
513 typedef struct
514 {
515     int type;       // GF_CDS, GF_EXON, GF_5UTR, GF_3UTR
516     uint32_t beg;
517     uint32_t end;
518     uint32_t trid;
519     uint32_t strand:1;   // STRAND_REV,STRAND_FWD
520     uint32_t phase:2;    // 0, 1 or 2
521     uint32_t iseq:29;
522 }
523 ftr_t;
524 /*
525     Mapping from GFF ID string (such as ENST00000450305 or Zm00001d027230_P001)
526     to integer id.  To keep the memory requirements low, the original version
527     relied on IDs in the form of a string prefix and a numerical id.  However,
528     it turns out that this assumption is not valid for some ensembl GFFs, see
529     for example Zea_mays.AGPv4.36.gff3.gz
530  */
531 typedef struct
532 {
533     void *str2id;       // khash_str2int
534     int nstr, mstr;
535     char **str;         // numeric id to string
536 }
537 id_tbl_t;
538 typedef struct
539 {
540     // all exons, CDS, UTRs
541     ftr_t *ftr;
542     int nftr, mftr;
543 
544     // mapping from gene id to gf_gene_t
545     kh_int2gene_t *gid2gene;
546 
547     // mapping from transcript id to tscript, for quick CDS anchoring
548     kh_int2tscript_t *id2tr;
549 
550     // sequences
551     void *seq2int;  // str2int hash
552     char **seq;
553     int nseq, mseq;
554 
555     // ignored biotypes
556     void *ignored_biotypes;
557 
558     id_tbl_t gene_ids;   // temporary table for mapping between gene id (eg. Zm00001d027245) and a numeric idx
559 }
560 aux_t;
561 
562 typedef struct _args_t
563 {
564     // the main regidx lookups, from chr:beg-end to overlapping features and
565     // index iterator
566     regidx_t *idx_cds, *idx_utr, *idx_exon, *idx_tscript;
567     regitr_t *itr;
568 
569     // temporary structures, deleted after initializtion
570     aux_t init;
571 
572     // text tab-delimited output (out) or vcf/bcf output (out_fh)
573     FILE *out;
574     htsFile *out_fh;
575 
576     // vcf
577     bcf_srs_t *sr;
578     bcf_hdr_t *hdr;
579     int hdr_nsmpl;          // actual number of samples in the vcf, for bcf_update_format_values()
580 
581     // include or exclude sites which match the filters
582     filter_t *filter;
583     char *filter_str;
584     int filter_logic;       // FLT_INCLUDE or FLT_EXCLUDE
585 
586     // samples to process
587     int sample_is_file;
588     char *sample_list;
589     smpl_ilist_t *smpl;
590 
591     char *outdir, **argv, *fa_fname, *gff_fname, *output_fname;
592     char *bcsq_tag;
593     int argc, output_type, clevel;
594     int phase, verbosity, local_csq, record_cmd_line;
595     int ncsq2_max, nfmt_bcsq;   // maximum number of csq per site that can be accessed from FORMAT/BCSQ (*2 and 1 bit skipped to avoid BCF missing values)
596     int ncsq2_small_warned;
597     int brief_predictions;
598 
599     int rid;                    // current chromosome
600     tr_heap_t *active_tr;       // heap of active transcripts for quick flushing
601     hap_t *hap;                 // transcript haplotype recursion
602     vbuf_t **vcf_buf;           // buffered VCF lines to annotate with CSQ and flush
603     rbuf_t vcf_rbuf;            // round buffer indexes to vcf_buf
604     kh_pos2vbuf_t *pos2vbuf;    // fast lookup of buffered lines by position
605     tscript_t **rm_tr;          // buffer of transcripts to clean
606     int nrm_tr, mrm_tr;
607     csq_t *csq_buf;             // pool of csq not managed by hap_node_t, i.e. non-CDS csqs
608     int ncsq_buf, mcsq_buf;
609     id_tbl_t tscript_ids;       // mapping between transcript id (eg. Zm00001d027245_T001) and a numeric idx
610     int force;                  // force run under various conditions. Currently only to skip out-of-phase transcripts
611     int n_threads;              // extra compression/decompression threads
612 
613     faidx_t *fai;
614     kstring_t str, str2;
615     int32_t *gt_arr, mgt_arr;
616 }
617 args_t;
618 
619 // AAA, AAC, ...
620 const char *gencode = "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF";
621 const uint8_t nt4[] =
622 {
623     4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
624     4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
625     4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
626     4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
627     4,0,4,1, 4,4,4,2, 4,4,4,4, 4,4,4,4,
628     4,4,4,4, 3,4,4,4, 4,4,4,4, 4,4,4,4,
629     4,0,4,1, 4,4,4,2, 4,4,4,4, 4,4,4,4,
630     4,4,4,4, 3
631 };
632 const uint8_t cnt4[] =
633 {
634     4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
635     4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
636     4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
637     4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
638     4,3,4,2, 4,4,4,1, 4,4,4,4, 4,4,4,4,
639     4,4,4,4, 0,4,4,4, 4,4,4,4, 4,4,4,4,
640     4,3,4,2, 4,4,4,1, 4,4,4,4, 4,4,4,4,
641     4,4,4,4, 0
642 };
643 #define dna2aa(x)  gencode[  nt4[(uint8_t)(x)[0]]<<4 |  nt4[(uint8_t)(x)[1]]<<2 |  nt4[(uint8_t)(x)[2]] ]
644 #define cdna2aa(x) gencode[ cnt4[(uint8_t)(x)[2]]<<4 | cnt4[(uint8_t)(x)[1]]<<2 | cnt4[(uint8_t)(x)[0]] ]
645 
646 static const char *gf_strings_noncoding[] =
647 {
648     "MT_rRNA", "MT_tRNA", "lincRNA", "miRNA", "misc_RNA", "rRNA", "snRNA", "snoRNA", "processed_transcript",
649     "antisense", "macro_lncRNA", "ribozyme", "sRNA", "scRNA", "scaRNA", "sense_intronic", "sense_overlapping",
650     "pseudogene", "processed_pseudogene", "artifact", "IG_pseudogene", "IG_C_pseudogene", "IG_J_pseudogene",
651     "IG_V_pseudogene", "TR_V_pseudogene", "TR_J_pseudogene", "MT_tRNA_pseudogene", "misc_RNA_pseudogene",
652     "miRNA_pseudogene", "ribozyme", "retained_intron", "retrotransposed", "Trna_pseudogene", "transcribed_processed_pseudogene",
653     "transcribed_unprocessed_pseudogene", "transcribed_unitary_pseudogene",    "translated_unprocessed_pseudogene",
654     "translated_processed_pseudogene", "known_ncRNA", "unitary_pseudogene", "unprocessed_pseudogene",
655     "LRG_gene", "3_prime_overlapping_ncRNA", "disrupted_domain", "vaultRNA", "bidirectional_promoter_lncRNA", "ambiguous_orf"
656 };
657 static const char *gf_strings_coding[] = { "protein_coding", "polymorphic_pseudogene", "IG_C", "IG_D", "IG_J", "IG_LV", "IG_V", "TR_C", "TR_D", "TR_J", "TR_V", "NMD", "non_stop_decay"};
658 static const char *gf_strings_special[] = { "CDS", "exon", "3_prime_UTR", "5_prime_UTR" };
659 
gf_type2gff_string(int type)660 const char *gf_type2gff_string(int type)
661 {
662     if ( !GF_is_coding(type) )
663     {
664         if ( type < (1<<GF_coding_bit) ) return gf_strings_noncoding[type-1];
665         type &= (1<<(GF_coding_bit+1)) - 1;
666         return gf_strings_special[type - 1];
667     }
668     type &= (1<<GF_coding_bit) - 1;
669     return gf_strings_coding[type - 1];
670 }
671 
672 /*
673     gff parsing functions
674 */
feature_set_seq(args_t * args,char * chr_beg,char * chr_end)675 static inline int feature_set_seq(args_t *args, char *chr_beg, char *chr_end)
676 {
677     aux_t *aux = &args->init;
678     char c = chr_end[1];
679     chr_end[1] = 0;
680     int iseq;
681     if ( khash_str2int_get(aux->seq2int, chr_beg, &iseq)!=0 )
682     {
683         // check for possible mismatch in chromosome naming convention such as chrX vs X
684         char *new_chr = NULL;
685         if ( faidx_has_seq(args->fai,chr_beg) )
686             new_chr = strdup(chr_beg);                  // valid chr name, the same in gff and faidx
687         else
688         {
689             int len = strlen(chr_beg);
690             if ( !strncmp("chr",chr_beg,3) && len>3 )
691                 new_chr = strdup(chr_beg+3);            // gff has the prefix, faidx does not
692             else
693             {
694                 new_chr = malloc(len+4);                // gff does not have the prefix, faidx has
695                 memcpy(new_chr,"chr",3);
696                 memcpy(new_chr+3,chr_beg,len);
697                 new_chr[len+3] = 0;
698             }
699             if ( !faidx_has_seq(args->fai,new_chr) )    // modification did not help, this sequence is not in fai
700             {
701                 static int unkwn_chr_warned = 0;
702                 if ( !unkwn_chr_warned && args->verbosity>0 )
703                     fprintf(stderr,"Warning: GFF chromosome \"%s\" not part of the reference genome\n",chr_beg);
704                 unkwn_chr_warned = 1;
705                 free(new_chr);
706                 new_chr = strdup(chr_beg);              // use the original sequence name
707             }
708         }
709         if ( khash_str2int_get(aux->seq2int, new_chr, &iseq)!=0 )
710         {
711             hts_expand(char*, aux->nseq+1, aux->mseq, aux->seq);
712             aux->seq[aux->nseq] = new_chr;
713             iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]);
714             aux->nseq++;
715             assert( aux->nseq < 1<<29 );  // see gf_gene_t.iseq and ftr_t.iseq
716         }
717         else
718             free(new_chr);
719     }
720     chr_end[1] = c;
721     return iseq;
722 }
gff_skip(const char * line,char * ss)723 static inline char *gff_skip(const char *line, char *ss)
724 {
725     while ( *ss && *ss!='\t' ) ss++;
726     if ( !*ss ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
727     return ss+1;
728 }
gff_parse_chr(const char * line,char ** chr_beg,char ** chr_end)729 static inline void gff_parse_chr(const char *line, char **chr_beg, char **chr_end)
730 {
731     char *se = (char*) line;
732     while ( *se && *se!='\t' ) se++;
733     if ( !*se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
734     *chr_beg = (char*) line;
735     *chr_end = se-1;
736 }
gff_parse_beg_end(const char * line,char * ss,uint32_t * beg,uint32_t * end)737 static inline char *gff_parse_beg_end(const char *line, char *ss, uint32_t *beg, uint32_t *end)
738 {
739     char *se = ss;
740     *beg = strtol(ss, &se, 10) - 1;
741     if ( ss==se ) error("[%s:%d %s] Could not parse the line:\n\t%s\n\t%s\n",__FILE__,__LINE__,__FUNCTION__,line,ss);
742     ss = se+1;
743     *end = strtol(ss, &se, 10) - 1;
744     if ( ss==se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
745     return se+1;
746 }
gff_id_init(id_tbl_t * tbl)747 static void gff_id_init(id_tbl_t *tbl)
748 {
749     memset(tbl, 0, sizeof(*tbl));
750     tbl->str2id = khash_str2int_init();
751 }
gff_id_destroy(id_tbl_t * tbl)752 static void gff_id_destroy(id_tbl_t *tbl)
753 {
754     khash_str2int_destroy_free(tbl->str2id);
755     free(tbl->str);
756 }
gff_id_parse(id_tbl_t * tbl,const char * line,const char * needle,char * ss)757 static inline uint32_t gff_id_parse(id_tbl_t *tbl, const char *line, const char *needle, char *ss)
758 {
759     ss = strstr(ss,needle);     // e.g. "ID=transcript:"
760     if ( !ss ) error("[%s:%d %s] Could not parse the line, \"%s\" not present: %s\n",__FILE__,__LINE__,__FUNCTION__,needle,line);
761     ss += strlen(needle);
762 
763     char *se = ss;
764     while ( *se && *se!=';' && !isspace(*se) ) se++;
765     char tmp = *se;
766     *se = 0;
767 
768     int id;
769     if ( khash_str2int_get(tbl->str2id, ss, &id) < 0 )
770     {
771         id = tbl->nstr++;
772         hts_expand(char*, tbl->nstr, tbl->mstr, tbl->str);
773         tbl->str[id] = strdup(ss);
774         khash_str2int_set(tbl->str2id, tbl->str[id], id);
775     }
776     *se = tmp;
777 
778     return id;
779 }
gff_parse_type(char * line)780 static inline int gff_parse_type(char *line)
781 {
782     line = strstr(line,"ID=");
783     if ( !line ) return -1;
784     line += 3;
785     if ( !strncmp(line,"transcript:",11) ) return GFF_TSCRIPT_LINE;
786     else if ( !strncmp(line,"gene:",5) ) return GFF_GENE_LINE;
787     return -1;
788 }
gff_parse_biotype(char * _line)789 static inline int gff_parse_biotype(char *_line)
790 {
791     char *line = strstr(_line,"biotype=");
792     if ( !line ) return -1;
793 
794     line += 8;
795     switch (*line)
796     {
797         case 'p':
798             if ( !strncmp(line,"protein_coding",14) ) return GF_PROTEIN_CODING;
799             else if ( !strncmp(line,"pseudogene",10) ) return GF_PSEUDOGENE;
800             else if ( !strncmp(line,"processed_transcript",20) ) return GF_PROCESSED_TRANSCRIPT;
801             else if ( !strncmp(line,"processed_pseudogene",20) ) return GF_PROCESSED_PSEUDOGENE;
802             else if ( !strncmp(line,"polymorphic_pseudogene",22) ) return GF_POLYMORPHIC_PSEUDOGENE;
803             break;
804         case 'a':
805             if ( !strncmp(line,"artifact",8) ) return GF_ARTIFACT;
806             else if ( !strncmp(line,"antisense",9) ) return GF_ANTISENSE;
807             else if ( !strncmp(line,"ambiguous_orf",13) ) return GF_AMBIGUOUS_ORF;
808             break;
809         case 'I':
810             if ( !strncmp(line,"IG_C_gene",9) ) return GF_IG_C;
811             else if ( !strncmp(line,"IG_D_gene",9) ) return GF_IG_D;
812             else if ( !strncmp(line,"IG_J_gene",9) ) return GF_IG_J;
813             else if ( !strncmp(line,"IG_LV_gene",10) ) return GF_IG_LV;
814             else if ( !strncmp(line,"IG_V_gene",9) ) return GF_IG_V;
815             else if ( !strncmp(line,"IG_pseudogene",13) ) return GF_IG_PSEUDOGENE;
816             else if ( !strncmp(line,"IG_C_pseudogene",15) ) return GF_IG_C_PSEUDOGENE;
817             else if ( !strncmp(line,"IG_J_pseudogene",15) ) return GF_IG_J_PSEUDOGENE;
818             else if ( !strncmp(line,"IG_V_pseudogene",15) ) return GF_IG_V_PSEUDOGENE;
819             break;
820         case 'T':
821             if ( !strncmp(line,"TR_C_gene",9) ) return GF_TR_C;
822             else if ( !strncmp(line,"TR_D_gene",9) ) return GF_TR_D;
823             else if ( !strncmp(line,"TR_J_gene",9) ) return GF_TR_J;
824             else if ( !strncmp(line,"TR_V_gene",9) ) return GF_TR_V;
825             else if ( !strncmp(line,"TR_V_pseudogene",15) ) return GF_TR_V_PSEUDOGENE;
826             else if ( !strncmp(line,"TR_J_pseudogene",15) ) return GF_TR_J_PSEUDOGENE;
827             break;
828         case 'M':
829             if ( !strncmp(line,"Mt_tRNA_pseudogene",18) ) return GF_MT_tRNA_PSEUDOGENE;
830             else if ( !strncmp(line,"Mt_tRNA",7) ) return GF_MT_tRNA;
831             else if ( !strncmp(line,"Mt_rRNA",7) ) return GF_MT_tRNA;
832             break;
833         case 'l':
834             if ( !strncmp(line,"lincRNA",7) ) return GF_lincRNA;
835             break;
836         case 'm':
837             if ( !strncmp(line,"macro_lncRNA",12) ) return GF_macro_lncRNA;
838             else if ( !strncmp(line,"misc_RNA_pseudogene",19) ) return GF_misc_RNA_PSEUDOGENE;
839             else if ( !strncmp(line,"miRNA_pseudogene",16) ) return GF_miRNA_PSEUDOGENE;
840             else if ( !strncmp(line,"miRNA",5) ) return GF_miRNA;
841             else if ( !strncmp(line,"misc_RNA",8) ) return GF_MISC_RNA;
842             break;
843         case 'r':
844             if ( !strncmp(line,"rRNA",4) ) return GF_rRNA;
845             else if ( !strncmp(line,"ribozyme",8) ) return GF_RIBOZYME;
846             else if ( !strncmp(line,"retained_intron",15) ) return GF_RETAINED_INTRON;
847             else if ( !strncmp(line,"retrotransposed",15) ) return GF_RETROTRANSPOSED;
848             break;
849         case 's':
850             if ( !strncmp(line,"snRNA",5) ) return GF_snRNA;
851             else if ( !strncmp(line,"sRNA",4) ) return GF_sRNA;
852             else if ( !strncmp(line,"scRNA",5) ) return GF_scRNA;
853             else if ( !strncmp(line,"scaRNA",6) ) return GF_scaRNA;
854             else if ( !strncmp(line,"snoRNA",6) ) return GF_snoRNA;
855             else if ( !strncmp(line,"sense_intronic",14) ) return GF_SENSE_INTRONIC;
856             else if ( !strncmp(line,"sense_overlapping",17) ) return GF_SENSE_OVERLAPPING;
857             break;
858         case 't':
859             if ( !strncmp(line,"tRNA_pseudogene",15) ) return GF_tRNA_PSEUDOGENE;
860             else if ( !strncmp(line,"transcribed_processed_pseudogene",32) ) return GF_TRANSCRIBED_PROCESSED_PSEUDOGENE;
861             else if ( !strncmp(line,"transcribed_unprocessed_pseudogene",34) ) return GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE;
862             else if ( !strncmp(line,"transcribed_unitary_pseudogene",30) ) return GF_TRANSCRIBED_UNITARY_PSEUDOGENE;
863             else if ( !strncmp(line,"translated_unprocessed_pseudogene",33) ) return GF_TRANSLATED_UNPROCESSED_PSEUDOGENE;
864             else if ( !strncmp(line,"translated_processed_pseudogene",31) ) return GF_TRANSLATED_PROCESSED_PSEUDOGENE;
865             break;
866         case 'n':
867             if ( !strncmp(line,"nonsense_mediated_decay",23) ) return GF_NMD;
868             else if ( !strncmp(line,"non_stop_decay",14) ) return GF_NON_STOP_DECAY;
869             break;
870         case 'k':
871             if ( !strncmp(line,"known_ncrna",11) ) return GF_KNOWN_NCRNA;
872             break;
873         case 'u':
874             if ( !strncmp(line,"unitary_pseudogene",18) ) return GF_UNITARY_PSEUDOGENE;
875             else if ( !strncmp(line,"unprocessed_pseudogene",22) ) return GF_UNPROCESSED_PSEUDOGENE;
876             break;
877         case 'L':
878             if ( !strncmp(line,"LRG_gene",8) ) return GF_LRG_GENE;
879             break;
880         case '3':
881             if ( !strncmp(line,"3prime_overlapping_ncRNA",24) ) return GF_3PRIME_OVERLAPPING_ncRNA;
882             break;
883         case 'd':
884             if ( !strncmp(line,"disrupted_domain",16) ) return GF_DISRUPTED_DOMAIN;
885             break;
886         case 'v':
887             if ( !strncmp(line,"vaultRNA",8) ) return GF_vaultRNA;
888             break;
889         case 'b':
890             if ( !strncmp(line,"bidirectional_promoter_lncRNA",29) ) return GF_BIDIRECTIONAL_PROMOTER_lncRNA;
891             break;
892     }
893     return 0;
894 }
gff_ignored_biotype(args_t * args,char * ss)895 static inline int gff_ignored_biotype(args_t *args, char *ss)
896 {
897     ss = strstr(ss,"biotype=");
898     if ( !ss ) return 0;
899 
900     ss += 8;
901     char *se = ss, tmp;
902     while ( *se && *se!=';' ) se++;
903     tmp = *se;
904     *se = 0;
905 
906     char *key = ss;
907     int n = 0;
908     if ( khash_str2int_get(args->init.ignored_biotypes, ss, &n)!=0 ) key = strdup(ss);
909     khash_str2int_set(args->init.ignored_biotypes, key, n+1);
910 
911     *se = tmp;
912     return 1;
913 }
gene_init(aux_t * aux,uint32_t gene_id)914 gf_gene_t *gene_init(aux_t *aux, uint32_t gene_id)
915 {
916     khint_t k = kh_get(int2gene, aux->gid2gene, (int)gene_id);
917     gf_gene_t *gene = (k == kh_end(aux->gid2gene)) ? NULL : kh_val(aux->gid2gene, k);
918     if ( !gene )
919     {
920         gene = (gf_gene_t*) calloc(1,sizeof(gf_gene_t));
921         int ret;
922         k = kh_put(int2gene, aux->gid2gene, (int)gene_id, &ret);
923         kh_val(aux->gid2gene,k) = gene;
924     }
925     return gene;
926 }
gff_parse_transcript(args_t * args,const char * line,char * ss,ftr_t * ftr)927 void gff_parse_transcript(args_t *args, const char *line, char *ss, ftr_t *ftr)
928 {
929     aux_t *aux = &args->init;
930     int biotype = gff_parse_biotype(ss);
931     if ( biotype <= 0 )
932     {
933         if ( !gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(stderr,"ignored transcript: %s\n",line);
934         return;
935     }
936 
937     // create a mapping from transcript_id to gene_id
938     uint32_t trid = gff_id_parse(&args->tscript_ids, line, "ID=transcript:", ss);
939     uint32_t gene_id = gff_id_parse(&args->init.gene_ids, line, "Parent=gene:", ss);
940 
941     tscript_t *tr = (tscript_t*) calloc(1,sizeof(tscript_t));
942     tr->id     = trid;
943     tr->strand = ftr->strand;
944     tr->gene   = gene_init(aux, gene_id);
945     tr->type   = biotype;
946     tr->beg    = ftr->beg;
947     tr->end    = ftr->end;
948 
949     khint_t k;
950     int ret;
951     k = kh_put(int2tscript, aux->id2tr, (int)trid, &ret);
952     kh_val(aux->id2tr,k) = tr;
953 }
gff_parse_gene(args_t * args,const char * line,char * ss,char * chr_beg,char * chr_end,ftr_t * ftr)954 void gff_parse_gene(args_t *args, const char *line, char *ss, char *chr_beg, char *chr_end, ftr_t *ftr)
955 {
956     int biotype = gff_parse_biotype(ss);
957     if ( biotype <= 0 )
958     {
959         if ( !gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(stderr,"ignored gene: %s\n",line);
960         return;
961     }
962 
963     aux_t *aux = &args->init;
964 
965     // substring search for "ID=gene:ENSG00000437963"
966     uint32_t gene_id = gff_id_parse(&aux->gene_ids, line, "ID=gene:", ss);
967     gf_gene_t *gene = gene_init(aux, gene_id);
968     assert( !gene->name );      // the gene_id should be unique
969 
970     gene->iseq = feature_set_seq(args, chr_beg,chr_end);
971 
972     // substring search for "Name=OR4F5"
973     ss = strstr(chr_end+2,"Name=");
974     if ( ss )
975     {
976         ss += 5;
977         char *se = ss;
978         while ( *se && *se!=';' && !isspace(*se) ) se++;
979         gene->name = (char*) malloc(se-ss+1);
980         memcpy(gene->name,ss,se-ss);
981         gene->name[se-ss] = 0;
982     }
983     else
984         gene->name = strdup(aux->gene_ids.str[gene_id]); // Name=<GeneName> field is not present, use the gene ID instead
985 }
gff_parse(args_t * args,char * line,ftr_t * ftr)986 int gff_parse(args_t *args, char *line, ftr_t *ftr)
987 {
988     // - skip empty lines and commented lines
989     // - columns
990     //      1.      chr
991     //      2.      <skip>
992     //      3.      CDS, transcript, gene, ...
993     //      4-5.    beg,end
994     //      6.      <skip>
995     //      7.      strand
996     //      8.      phase
997     //      9.      Parent=transcript:ENST(\d+);ID=... etc
998 
999     char *ss = line;
1000     if ( !*ss ) return -1;      // skip blank lines
1001     if ( *ss=='#' ) return -1;  // skip comments
1002 
1003     char *chr_beg, *chr_end;
1004     gff_parse_chr(line, &chr_beg, &chr_end);
1005     ss = gff_skip(line, chr_end + 2);
1006 
1007     // 3. column: is this a CDS, transcript, gene, etc.
1008     if ( !strncmp("exon\t",ss,5) ) { ftr->type = GF_EXON; ss += 5; }
1009     else if ( !strncmp("CDS\t",ss,4) ) { ftr->type = GF_CDS; ss += 4; }
1010     else if ( !strncmp("three_prime_UTR\t",ss,16) ) { ftr->type = GF_UTR3; ss += 16; }
1011     else if ( !strncmp("five_prime_UTR\t",ss,15) ) { ftr->type = GF_UTR5; ss += 15; }
1012     else
1013     {
1014         ss = gff_skip(line, ss);
1015         ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end);
1016         ss = gff_skip(line, ss);
1017         int type = gff_parse_type(ss);
1018         if ( type!=GFF_TSCRIPT_LINE && type!=GFF_GENE_LINE )
1019         {
1020             // we ignore these, debug print to see new types:
1021             ss = strstr(ss,"ID=");
1022             if ( !ss ) return -1;   // no ID, ignore the line
1023             if ( !strncmp("chromosome",ss+3,10) ) return -1;
1024             if ( !strncmp("supercontig",ss+3,11) ) return -1;
1025             if ( args->verbosity > 0 ) fprintf(stderr,"ignored: %s\n", line);
1026             return -1;
1027         }
1028 
1029         // 7. column: strand
1030         if ( *ss == '+' ) ftr->strand = STRAND_FWD;
1031         else if ( *ss == '-' ) ftr->strand = STRAND_REV;
1032         else error("Unknown strand: %c .. %s\n", *ss,ss);
1033 
1034         if ( type==GFF_TSCRIPT_LINE )
1035             gff_parse_transcript(args, line, ss, ftr);
1036         else
1037             gff_parse_gene(args, line, ss, chr_beg, chr_end, ftr);
1038 
1039         return -1;
1040     }
1041     ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end);
1042     ss = gff_skip(line, ss);
1043 
1044     // 7. column: strand
1045     if ( *ss == '+' ) ftr->strand = STRAND_FWD;
1046     else if ( *ss == '-' ) ftr->strand = STRAND_REV;
1047     else { if ( args->verbosity > 0 ) fprintf(stderr,"Skipping unknown strand: %c\n", *ss); return -1; }
1048     ss += 2;
1049 
1050     // 8. column: phase (codon offset)
1051     if ( *ss == '0' ) ftr->phase = 0;
1052     else if ( *ss == '1' ) ftr->phase = 1;
1053     else if ( *ss == '2' ) ftr->phase = 2;
1054     else if ( *ss == '.' ) ftr->phase = 0;      // exons do not have phase
1055     else { if ( args->verbosity > 0 ) fprintf(stderr,"Skipping unknown phase: %c, %s\n", *ss, line); return -1; }
1056     ss += 2;
1057 
1058     // substring search for "Parent=transcript:ENST00000437963"
1059     ftr->trid = gff_id_parse(&args->tscript_ids, line, "Parent=transcript:", ss);
1060     ftr->iseq = feature_set_seq(args, chr_beg,chr_end);
1061     return 0;
1062 }
1063 
cmp_cds_ptr(const void * a,const void * b)1064 static int cmp_cds_ptr(const void *a, const void *b)
1065 {
1066     // comparison function for qsort of transcripts's CDS
1067     if ( (*((gf_cds_t**)a))->beg < (*((gf_cds_t**)b))->beg ) return -1;
1068     if ( (*((gf_cds_t**)a))->beg > (*((gf_cds_t**)b))->beg ) return 1;
1069     return 0;
1070 }
1071 
chr_beg_end(aux_t * aux,int iseq,char ** chr_beg,char ** chr_end)1072 static inline void chr_beg_end(aux_t *aux, int iseq, char **chr_beg, char **chr_end)
1073 {
1074     *chr_beg = *chr_end = aux->seq[iseq];
1075     while ( (*chr_end)[1] ) (*chr_end)++;
1076 }
tscript_init(aux_t * aux,uint32_t trid)1077 tscript_t *tscript_init(aux_t *aux, uint32_t trid)
1078 {
1079     khint_t k = kh_get(int2tscript, aux->id2tr, (int)trid);
1080     tscript_t *tr = (k == kh_end(aux->id2tr)) ? NULL : kh_val(aux->id2tr, k);
1081     assert( tr );
1082     return tr;
1083 }
register_cds(args_t * args,ftr_t * ftr)1084 void register_cds(args_t *args, ftr_t *ftr)
1085 {
1086     // Make the CDS searchable via idx_cds. Note we do not malloc tr->cds just yet.
1087     //  ftr is the result of parsing a gff CDS line
1088     aux_t *aux = &args->init;
1089 
1090     tscript_t *tr = tscript_init(aux, ftr->trid);
1091     if ( tr->strand != ftr->strand ) error("Conflicting strand in transcript %"PRIu32" .. %d vs %d\n",ftr->trid,tr->strand,ftr->strand);
1092 
1093     gf_cds_t *cds = (gf_cds_t*) malloc(sizeof(gf_cds_t));
1094     cds->tr    = tr;
1095     cds->beg   = ftr->beg;
1096     cds->len   = ftr->end - ftr->beg + 1;
1097     cds->icds  = 0;     // to keep valgrind on mac happy
1098     cds->phase = ftr->phase;
1099 
1100     hts_expand(gf_cds_t*,tr->ncds+1,tr->mcds,tr->cds);
1101     tr->cds[tr->ncds++] = cds;
1102 }
register_utr(args_t * args,ftr_t * ftr)1103 void register_utr(args_t *args, ftr_t *ftr)
1104 {
1105     aux_t *aux = &args->init;
1106     gf_utr_t *utr = (gf_utr_t*) malloc(sizeof(gf_utr_t));
1107     utr->which = ftr->type==GF_UTR3 ? prime3 : prime5;
1108     utr->beg   = ftr->beg;
1109     utr->end   = ftr->end;
1110     utr->tr    = tscript_init(aux, ftr->trid);
1111 
1112     char *chr_beg, *chr_end;
1113     chr_beg_end(&args->init, utr->tr->gene->iseq, &chr_beg, &chr_end);
1114     regidx_push(args->idx_utr, chr_beg,chr_end, utr->beg,utr->end, &utr);
1115 }
register_exon(args_t * args,ftr_t * ftr)1116 void register_exon(args_t *args, ftr_t *ftr)
1117 {
1118     aux_t *aux = &args->init;
1119     gf_exon_t *exon = (gf_exon_t*) malloc(sizeof(gf_exon_t));
1120     exon->beg = ftr->beg;
1121     exon->end = ftr->end;
1122     exon->tr  = tscript_init(aux, ftr->trid);
1123 
1124     char *chr_beg, *chr_end;
1125     chr_beg_end(&args->init, exon->tr->gene->iseq, &chr_beg, &chr_end);
1126     regidx_push(args->idx_exon, chr_beg,chr_end, exon->beg - N_SPLICE_REGION_INTRON, exon->end + N_SPLICE_REGION_INTRON, &exon);
1127 }
1128 
tscript_init_cds(args_t * args)1129 void tscript_init_cds(args_t *args)
1130 {
1131     aux_t *aux = &args->init;
1132 
1133     // Sort CDS in all transcripts, set offsets, check their phase, length, create index (idx_cds)
1134     khint_t k;
1135     for (k=0; k<kh_end(aux->id2tr); k++)
1136     {
1137         if ( !kh_exist(aux->id2tr, k) ) continue;
1138         tscript_t *tr = (tscript_t*) kh_val(aux->id2tr, k);
1139 
1140         // position-to-tscript lookup
1141         char *chr_beg, *chr_end;
1142         chr_beg_end(aux, tr->gene->iseq, &chr_beg, &chr_end);
1143         regidx_push(args->idx_tscript, chr_beg, chr_end, tr->beg, tr->end, &tr);
1144 
1145         if ( !tr->ncds ) continue;      // transcript with no CDS
1146 
1147         // sort CDs
1148         qsort(tr->cds, tr->ncds, sizeof(gf_cds_t*), cmp_cds_ptr);
1149 
1150         // trim non-coding start
1151         int i, len = 0;
1152         if ( tr->strand==STRAND_FWD )
1153         {
1154             if ( tr->cds[0]->phase ) tr->trim |= TRIM_5PRIME;
1155             tr->cds[0]->beg += tr->cds[0]->phase;
1156             tr->cds[0]->len -= tr->cds[0]->phase;
1157             tr->cds[0]->phase = 0;
1158 
1159             // sanity check phase; the phase number in gff tells us how many bases to skip in this
1160             // feature to reach the first base of the next codon
1161             int tscript_ok = 1;
1162             for (i=0; i<tr->ncds; i++)
1163             {
1164                 int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
1165                 if ( phase!=len%3)
1166                 {
1167                     if ( args->force )
1168                     {
1169                         if ( args->verbosity > 0 )
1170                             fprintf(stderr,"Warning: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
1171                         tscript_ok = 0;
1172                         break;
1173                     }
1174                     error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
1175                         args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
1176                 }
1177                 len += tr->cds[i]->len;
1178             }
1179             if ( !tscript_ok ) continue;    // skip this transcript
1180         }
1181         else
1182         {
1183             // Check that the phase is not bigger than CDS length. Curiously, this can really happen,
1184             // see Mus_musculus.GRCm38.85.gff3.gz, transcript:ENSMUST00000163141
1185             // todo: the same for the fwd strand
1186             i = tr->ncds - 1;
1187             int phase = tr->cds[i]->phase;
1188             if ( phase ) tr->trim |= TRIM_5PRIME;
1189             while ( i>=0 && phase > tr->cds[i]->len )
1190             {
1191                 phase -= tr->cds[i]->len;
1192                 tr->cds[i]->phase = 0;
1193                 tr->cds[i]->len   = 0;
1194                 i--;
1195             }
1196             tr->cds[i]->len  -= tr->cds[i]->phase;
1197             tr->cds[i]->phase = 0;
1198 
1199             // sanity check phase
1200             int tscript_ok = 1;
1201             for (i=tr->ncds-1; i>=0; i--)
1202             {
1203                 int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
1204                 if ( phase!=len%3)
1205                 {
1206                     if ( args->force )
1207                     {
1208                         if ( args->verbosity > 0 )
1209                             fprintf(stderr,"Warning: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
1210                         tscript_ok = 0;
1211                         break;
1212                     }
1213                     error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
1214                         args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
1215                 }
1216                 len += tr->cds[i]->len;
1217             }
1218             if ( !tscript_ok ) continue;    // skip this transcript
1219         }
1220 
1221         // set len. At the same check that CDS within a transcript do not overlap
1222         len = 0;
1223         for (i=0; i<tr->ncds; i++)
1224         {
1225             tr->cds[i]->icds = i;
1226             len += tr->cds[i]->len;
1227             if ( !i ) continue;
1228 
1229             gf_cds_t *a = tr->cds[i-1];
1230             gf_cds_t *b = tr->cds[i];
1231             if ( a->beg + a->len - 1 >= b->beg )
1232             {
1233                 if ( args->force )
1234                 {
1235                     fprintf(stderr,"Warning: GFF contains overlapping CDS %s: %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32".\n",
1236                         args->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len);
1237                 }
1238                 else
1239                     error("Error: CDS overlap in the transcript %s: %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32", is this intended (e.g. ribosomal slippage)?\n"
1240                           "       Use the --force option to override (at your own risk).\n",
1241                             args->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len);
1242             }
1243         }
1244         if ( len%3 != 0 )
1245         {
1246             // There are 13k transcripts with incomplete 3' CDS. See for example ENST00000524289
1247             //  http://sep2015.archive.ensembl.org/Homo_sapiens/Transcript/Sequence_cDNA?db=core;g=ENSG00000155868;r=5:157138846-157159019;t=ENST00000524289
1248             // Also, the incomplete CDS can be too short (1 or 2bp), so it is not enough to trim the last one.
1249 
1250             tr->trim |= TRIM_3PRIME;
1251             if ( tr->strand==STRAND_FWD )
1252             {
1253                 i = tr->ncds - 1;
1254                 while ( i>=0 && len%3 )
1255                 {
1256                     int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len;
1257                     tr->cds[i]->len -= dlen;
1258                     len -= dlen;
1259                     i--;
1260                 }
1261             }
1262             else
1263             {
1264                 i = 0;
1265                 while ( i<tr->ncds && len%3 )
1266                 {
1267                     int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len;
1268                     tr->cds[i]->len -= dlen;
1269                     tr->cds[i]->beg += dlen;
1270                     len -= dlen;
1271                     i++;
1272                 }
1273             }
1274         }
1275 
1276         // set CDS offsets and insert into regidx
1277         len=0;
1278         for (i=0; i<tr->ncds; i++)
1279         {
1280             tr->cds[i]->pos = len;
1281             len += tr->cds[i]->len;
1282             regidx_push(args->idx_cds, chr_beg,chr_end, tr->cds[i]->beg,tr->cds[i]->beg+tr->cds[i]->len-1, &tr->cds[i]);
1283         }
1284     }
1285 }
1286 
regidx_free_gf(void * payload)1287 void regidx_free_gf(void *payload) { free(*((gf_cds_t**)payload)); }
regidx_free_tscript(void * payload)1288 void regidx_free_tscript(void *payload) { tscript_t *tr = *((tscript_t**)payload); free(tr->cds); free(tr); }
1289 
init_gff(args_t * args)1290 void init_gff(args_t *args)
1291 {
1292     aux_t *aux = &args->init;
1293     aux->seq2int   = khash_str2int_init();   // chrom's numeric id
1294     aux->gid2gene  = kh_init(int2gene);      // gene id to gf_gene_t, for idx_gene
1295     aux->id2tr     = kh_init(int2tscript);   // transcript id to tscript_t
1296     args->idx_tscript = regidx_init(NULL, NULL, regidx_free_tscript, sizeof(tscript_t*), NULL);
1297     aux->ignored_biotypes = khash_str2int_init();
1298     gff_id_init(&aux->gene_ids);
1299     gff_id_init(&args->tscript_ids);
1300 
1301     // parse gff
1302     kstring_t str = {0,0,0};
1303     htsFile *fp = hts_open(args->gff_fname,"r");
1304     if ( !fp ) error("Failed to read %s\n", args->gff_fname);
1305     while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 )
1306     {
1307         hts_expand(ftr_t, aux->nftr+1, aux->mftr, aux->ftr);
1308         int ret = gff_parse(args, str.s, aux->ftr + aux->nftr);
1309         if ( !ret ) aux->nftr++;
1310     }
1311     free(str.s);
1312     if ( hts_close(fp)!=0 ) error("Close failed: %s\n", args->gff_fname);
1313 
1314 
1315     // process gff information: connect CDS and exons to transcripts
1316     args->idx_cds  = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_cds_t*), NULL);
1317     args->idx_utr  = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_utr_t*), NULL);
1318     args->idx_exon = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_exon_t*), NULL);
1319     args->itr      = regitr_init(NULL);
1320 
1321     int i;
1322     for (i=0; i<aux->nftr; i++)
1323     {
1324         ftr_t *ftr = &aux->ftr[i];
1325 
1326         // check whether to keep this feature: is there a mapping trid -> gene_id -> gene?
1327         khint_t k = kh_get(int2tscript, aux->id2tr, (int)ftr->trid);
1328         if ( k==kh_end(aux->id2tr) ) continue;       // no such transcript
1329 
1330         tscript_t *tr = kh_val(aux->id2tr,k);
1331         if ( !tr->gene->name )
1332         {
1333             // not a supported biotype (e.g. gene:pseudogene, transcript:processed_transcript)
1334             regidx_free_tscript(&tr);
1335             kh_del(int2tscript, aux->id2tr,k);
1336             continue;
1337         }
1338 
1339         // populate regidx by category:
1340         //      ftr->type   .. GF_CDS, GF_EXON, GF_UTR3, GF_UTR5
1341         //      gene->type  .. GF_PROTEIN_CODING, GF_MT_rRNA, GF_IG_C, ...
1342         if ( ftr->type==GF_CDS ) register_cds(args, ftr);
1343         else if ( ftr->type==GF_EXON ) register_exon(args, ftr);
1344         else if ( ftr->type==GF_UTR5 ) register_utr(args, ftr);
1345         else if ( ftr->type==GF_UTR3 ) register_utr(args, ftr);
1346         else
1347             error("something: %s\t%d\t%d\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,args->tscript_ids.str[ftr->trid],gf_type2gff_string(ftr->type));
1348     }
1349     tscript_init_cds(args);
1350 
1351     if ( args->verbosity > 0 )
1352     {
1353         fprintf(stderr,"Indexed %d transcripts, %d exons, %d CDSs, %d UTRs\n",
1354                 regidx_nregs(args->idx_tscript),
1355                 regidx_nregs(args->idx_exon),
1356                 regidx_nregs(args->idx_cds),
1357                 regidx_nregs(args->idx_utr));
1358     }
1359 
1360     free(aux->ftr);
1361     khash_str2int_destroy_free(aux->seq2int);
1362     // keeping only to destroy the genes at the end: kh_destroy(int2gene,aux->gid2gene);
1363     kh_destroy(int2tscript,aux->id2tr);
1364     free(aux->seq);
1365     gff_id_destroy(&aux->gene_ids);
1366 
1367     if ( args->verbosity > 0 && khash_str2int_size(aux->ignored_biotypes) )
1368     {
1369         khash_t(str2int) *ign = (khash_t(str2int)*)aux->ignored_biotypes;
1370         fprintf(stderr,"Ignored the following biotypes:\n");
1371         for (i = kh_begin(ign); i < kh_end(ign); i++)
1372         {
1373             if ( !kh_exist(ign,i)) continue;
1374             const char *biotype = kh_key(ign,i);
1375             if ( !strcmp(biotype,"TCE") ) biotype = "TCE (\"To be Experimentally Confirmed\")";
1376             fprintf(stderr,"\t%dx\t.. %s\n", kh_value(ign,i), biotype);
1377         }
1378     }
1379     khash_str2int_destroy_free(aux->ignored_biotypes);
1380 }
1381 
ncsq2_to_nfmt(int ncsq2)1382 static inline int ncsq2_to_nfmt(int ncsq2)
1383 {
1384     return 1 + (ncsq2 - 1) / 30;
1385 }
icsq2_to_bit(int icsq2,int * ival,int * ibit)1386 static inline void icsq2_to_bit(int icsq2, int *ival, int *ibit)
1387 {
1388     *ival = icsq2 / 30;
1389     *ibit = icsq2 % 30;
1390 }
1391 
init_data(args_t * args)1392 void init_data(args_t *args)
1393 {
1394     args->nfmt_bcsq = ncsq2_to_nfmt(args->ncsq2_max);
1395 
1396     args->fai = fai_load(args->fa_fname);
1397     if ( !args->fai ) error("Failed to load the fai index: %s\n", args->fa_fname);
1398 
1399     if ( args->verbosity > 0 ) fprintf(stderr,"Parsing %s ...\n", args->gff_fname);
1400     init_gff(args);
1401 
1402     args->rid = -1;
1403 
1404     if ( args->filter_str )
1405         args->filter = filter_init(args->hdr, args->filter_str);
1406 
1407     args->pos2vbuf  = kh_init(pos2vbuf);
1408     args->active_tr = khp_init(trhp);
1409     args->hap = (hap_t*) calloc(1,sizeof(hap_t));
1410 
1411     // init samples
1412     if ( !bcf_hdr_nsamples(args->hdr) ) args->phase = PHASE_DROP_GT;
1413     if ( args->sample_list && !strcmp("-",args->sample_list) )
1414     {
1415         // ignore all samples
1416         if ( args->output_type==FT_TAB_TEXT )
1417         {
1418             // significant speedup for plain VCFs
1419             if (bcf_hdr_set_samples(args->hdr,NULL,0) < 0)
1420                 error_errno("[%s] Couldn't build sample filter", __func__);
1421         }
1422         args->phase = PHASE_DROP_GT;
1423     }
1424     else
1425         args->smpl = smpl_ilist_init(args->hdr, args->sample_list, args->sample_is_file, SMPL_STRICT);
1426     args->hdr_nsmpl = args->phase==PHASE_DROP_GT ? 0 : bcf_hdr_nsamples(args->hdr);
1427 
1428     if ( args->output_type==FT_TAB_TEXT )
1429     {
1430         args->out = args->output_fname ? fopen(args->output_fname,"w") : stdout;
1431         if ( !args->out ) error("Failed to write to %s: %s\n", !strcmp("-",args->output_fname)?"standard output":args->output_fname,strerror(errno));
1432 
1433         fprintf(args->out,"# This file was produced by: bcftools +csq(%s+htslib-%s)\n", bcftools_version(),hts_version());
1434         fprintf(args->out,"# The command line was:\tbcftools +%s", args->argv[0]);
1435         int i;
1436         for (i=1; i<args->argc; i++)
1437             fprintf(args->out," %s",args->argv[i]);
1438         fprintf(args->out,"\n");
1439         fprintf(args->out,"# LOG\t[2]Message\n");
1440         fprintf(args->out,"# CSQ"); i = 1;
1441         fprintf(args->out,"\t[%d]Sample", ++i);
1442         fprintf(args->out,"\t[%d]Haplotype", ++i);
1443         fprintf(args->out,"\t[%d]Chromosome", ++i);
1444         fprintf(args->out,"\t[%d]Position", ++i);
1445         fprintf(args->out,"\t[%d]Consequence", ++i);
1446         fprintf(args->out,"\n");
1447     }
1448     else
1449     {
1450         char wmode[8];
1451         set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
1452         args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode);
1453         if ( args->out_fh == NULL ) error("[%s] Error: cannot write to %s: %s\n", __func__,args->output_fname? args->output_fname : "standard output", strerror(errno));
1454         if ( args->n_threads > 0)
1455             hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->sr->p);
1456         if ( args->record_cmd_line ) bcf_hdr_append_version(args->hdr,args->argc,args->argv,"bcftools/csq");
1457         bcf_hdr_printf(args->hdr,"##INFO=<ID=%s,Number=.,Type=String,Description=\"%s consequence annotation from BCFtools/csq, see http://samtools.github.io/bcftools/howtos/csq-calling.html for details. Format: Consequence|gene|transcript|biotype|strand|amino_acid_change|dna_change\">",args->bcsq_tag, args->local_csq ? "Local" : "Haplotype-aware");
1458         if ( args->hdr_nsmpl )
1459             bcf_hdr_printf(args->hdr,"##FORMAT=<ID=%s,Number=.,Type=Integer,Description=\"Bitmask of indexes to INFO/BCSQ, with interleaved first/second haplotype. Use \\\"bcftools query -f'[%%CHROM\\t%%POS\\t%%SAMPLE\\t%%TBCSQ\\n]'\\\" to translate.\">",args->bcsq_tag);
1460         if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname?args->output_fname:"standard output");
1461     }
1462     if ( args->verbosity > 0 ) fprintf(stderr,"Calling...\n");
1463 }
1464 
destroy_data(args_t * args)1465 void destroy_data(args_t *args)
1466 {
1467     if ( args->ncsq2_small_warned )
1468         fprintf(stderr,
1469             "Note: Some samples had too many consequences to be represented in %d bytes. If you need to record them all,\n"
1470             "      the limit can be increased by running with `--ncsq %d`.\n",ncsq2_to_nfmt(args->ncsq2_max)/8,1+args->ncsq2_small_warned/2);
1471 
1472     regidx_destroy(args->idx_cds);
1473     regidx_destroy(args->idx_utr);
1474     regidx_destroy(args->idx_exon);
1475     regidx_destroy(args->idx_tscript);
1476     regitr_destroy(args->itr);
1477 
1478     khint_t k,i,j;
1479     for (k=0; k<kh_end(args->init.gid2gene); k++)
1480     {
1481         if ( !kh_exist(args->init.gid2gene, k) ) continue;
1482         gf_gene_t *gene = (gf_gene_t*) kh_val(args->init.gid2gene, k);
1483         free(gene->name);
1484         free(gene);
1485     }
1486     kh_destroy(int2gene,args->init.gid2gene);
1487 
1488     if ( args->filter )
1489         filter_destroy(args->filter);
1490 
1491     khp_destroy(trhp,args->active_tr);
1492     kh_destroy(pos2vbuf,args->pos2vbuf);
1493     if ( args->smpl ) smpl_ilist_destroy(args->smpl);
1494     int ret;
1495     if ( args->out_fh )
1496         ret = hts_close(args->out_fh);
1497     else
1498         ret = fclose(args->out);
1499     if ( ret ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
1500     for (i=0; i<args->vcf_rbuf.m; i++)
1501     {
1502         vbuf_t *vbuf = args->vcf_buf[i];
1503         if ( !vbuf ) continue;
1504         for (j=0; j<vbuf->m; j++)
1505         {
1506             if ( !vbuf->vrec[j] ) continue;
1507             if ( vbuf->vrec[j]->line ) bcf_destroy(vbuf->vrec[j]->line);
1508             free(vbuf->vrec[j]->fmt_bm);
1509             free(vbuf->vrec[j]->vcsq);
1510             free(vbuf->vrec[j]);
1511         }
1512         free(vbuf->vrec);
1513         free(vbuf);
1514     }
1515     free(args->vcf_buf);
1516     free(args->rm_tr);
1517     free(args->csq_buf);
1518     free(args->hap->stack);
1519     free(args->hap->sseq.s);
1520     free(args->hap->tseq.s);
1521     free(args->hap->tref.s);
1522     free(args->hap);
1523     fai_destroy(args->fai);
1524     free(args->gt_arr);
1525     free(args->str.s);
1526     free(args->str2.s);
1527     gff_id_destroy(&args->tscript_ids);
1528 }
1529 
1530 /*
1531     The splice_* functions are for consquences around splice sites: start,stop,splice_*
1532  */
1533 #define SPLICE_VAR_REF 0   // ref: ACGT>ACGT, csq not applicable, skip completely
1534 #define SPLICE_OUTSIDE 1   // splice acceptor or similar; csq set and is done, does not overlap the region
1535 #define SPLICE_INSIDE  2   // overlaps coding region; csq can be set but coding prediction is needed
1536 #define SPLICE_OVERLAP 3   // indel overlaps region boundary, csq set but could not determine csq
1537 typedef struct
1538 {
1539     tscript_t *tr;
1540     struct {
1541         int32_t pos, rlen, alen, ial;
1542         char *ref, *alt;
1543         bcf1_t *rec;
1544     } vcf;
1545     uint16_t check_acceptor:1,  // check distance from exon start (fwd) or end (rev)
1546              check_start:1,     // this is the first coding exon (relative to transcript orientation), check first (fwd) or last (rev) codon
1547              check_stop:1,      // this is the last coding exon (relative to transcript orientation), check last (fwd) or first (rev) codon
1548              check_donor:1,     // as with check_acceptor
1549              check_region_beg:1,    // do/don't check for splices at this end, eg. in the first or last exon
1550              check_region_end:1,    //
1551              check_utr:1,           // check splice sites (acceptor/donor/region_*) only if not in utr
1552              set_refalt:1;          // set kref,kalt, if set, check also for synonymous events
1553     uint32_t csq;
1554     int tbeg, tend;             // number of trimmed bases from beg and end of ref,alt allele
1555     uint32_t ref_beg,           // ref coordinates with spurious bases removed, ACC>AC can become AC>A or CC>C, whichever gives
1556              ref_end;           // a more conservative csq (the first and last base in kref.s)
1557     kstring_t kref, kalt;       // trimmed alleles, set only with SPLICE_OLAP
1558 }
1559 splice_t;
splice_init(splice_t * splice,bcf1_t * rec)1560 void splice_init(splice_t *splice, bcf1_t *rec)
1561 {
1562     memset(splice,0,sizeof(*splice));
1563     splice->vcf.rec  = rec;
1564     splice->vcf.pos  = rec->pos;
1565     splice->vcf.rlen = rec->rlen;
1566     splice->vcf.ref  = rec->d.allele[0];
1567     splice->csq      = 0;
1568 }
splice_build_hap(splice_t * splice,uint32_t beg,int len)1569 static inline void splice_build_hap(splice_t *splice, uint32_t beg, int len)
1570 {
1571     // len>0 .. beg is the first base, del filled from right
1572     // len<0 .. beg is the last base, del filled from left
1573 
1574     int rlen, alen, rbeg, abeg;     // first base to include (ref coordinates)
1575     if ( len<0 )
1576     {
1577         rlen = alen = -len;
1578         rbeg = beg - rlen + 1;
1579         int dlen = splice->vcf.alen - splice->vcf.rlen;
1580         if ( dlen<0 && beg < splice->ref_end ) // incomplete del, beg is in the middle
1581             dlen += splice->ref_end - beg;
1582         abeg = rbeg + dlen;
1583     }
1584     else
1585     {
1586         rbeg = abeg = beg;
1587         rlen = alen = len;
1588         // check for incomplete del as above??
1589     }
1590 
1591 #define XDBG 0
1592 #if XDBG
1593 fprintf(stderr,"build_hap:  rbeg=%d + %d    abeg=%d \n",rbeg,rlen,abeg);
1594 #endif
1595     splice->kref.l = 0;
1596     splice->kalt.l = 0;
1597 
1598     // add the part before vcf.ref, in the vcf.ref and after vcf.ref
1599     int roff;   // how many vcf.ref bases already used
1600     if ( rbeg < splice->vcf.pos )
1601     {
1602         assert( splice->tr->beg <= rbeg );  // this can be extended thanks to N_REF_PAD
1603         kputsn(splice->tr->ref + N_REF_PAD + rbeg - splice->tr->beg, splice->vcf.pos - rbeg, &splice->kref);
1604         roff = 0;
1605     }
1606     else
1607         roff = rbeg - splice->vcf.pos;
1608 #if XDBG
1609 fprintf(stderr,"r1: %s  roff=%d\n",splice->kref.s,roff);
1610 #endif
1611 
1612     if ( roff < splice->vcf.rlen && splice->kref.l < rlen )
1613     {
1614         int len = splice->vcf.rlen - roff;  // len still available in vcf.ref
1615         if ( len > rlen - splice->kref.l ) len = rlen - splice->kref.l; // how much of ref allele is still needed
1616         kputsn(splice->vcf.ref + roff, len, &splice->kref);
1617     }
1618 #if XDBG
1619 fprintf(stderr,"r2: %s\n",splice->kref.s);
1620 #endif
1621 
1622     uint32_t end = splice->vcf.pos + splice->vcf.rlen;    // position just after the ref allele
1623     if ( splice->kref.l < rlen )
1624     {
1625         if ( end + rlen - splice->kref.l - 1 > splice->tr->end ) // trim, the requested sequence is too long (could be extended, see N_REF_PAD)
1626             rlen -= end + rlen - splice->kref.l - 1 - splice->tr->end;
1627         if ( splice->kref.l < rlen )
1628             kputsn(splice->tr->ref + N_REF_PAD + end - splice->tr->beg, rlen - splice->kref.l, &splice->kref);
1629     }
1630 #if XDBG
1631 fprintf(stderr,"r3: %s\n",splice->kref.s);
1632 #endif
1633 
1634 
1635     int aoff;
1636     if ( abeg < splice->vcf.pos )
1637     {
1638         assert( splice->tr->beg <= abeg );
1639         kputsn(splice->tr->ref + N_REF_PAD + abeg - splice->tr->beg, splice->vcf.pos - abeg, &splice->kalt);
1640         aoff = 0;
1641     }
1642     else
1643         aoff = abeg - splice->vcf.pos;
1644 #if XDBG
1645 fprintf(stderr,"a1: %s  aoff=%d\n",splice->kalt.s,aoff);
1646 #endif
1647 
1648     if ( aoff < splice->vcf.alen && splice->kalt.l < alen )
1649     {
1650         int len = splice->vcf.alen - aoff;  // len still available in vcf.alt
1651         if ( len > alen - splice->kalt.l ) len = alen - splice->kalt.l; // how much of alt allele is still needed
1652         kputsn(splice->vcf.alt + aoff, len, &splice->kalt);
1653         aoff -= len;
1654     }
1655     if ( aoff < 0 ) aoff = 0;
1656     else aoff--;
1657 #if XDBG
1658 fprintf(stderr,"a2: %s  aoff=%d\n",splice->kalt.s,aoff);
1659 #endif
1660 
1661     end = splice->vcf.pos + splice->vcf.rlen;    // position just after the ref allele
1662     if ( splice->kalt.l < alen )
1663     {
1664         if ( end + alen + aoff - splice->kalt.l - 1 > splice->tr->end ) // trim, the requested sequence is too long
1665             alen -= end + alen + aoff - splice->kalt.l - 1 - splice->tr->end;
1666         if ( alen > 0 && alen > splice->kalt.l )
1667             kputsn(splice->tr->ref + aoff + N_REF_PAD + end - splice->tr->beg, alen - splice->kalt.l, &splice->kalt);
1668     }
1669 #if XDBG
1670 fprintf(stderr,"a3: %s\n",splice->kalt.s);
1671 fprintf(stderr," [%s]\n [%s]\n\n",splice->kref.s,splice->kalt.s);
1672 #endif
1673 }
1674 void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec);
csq_stage_utr(args_t * args,regitr_t * itr,bcf1_t * rec,uint32_t trid,uint32_t type,int ial)1675 static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32_t trid, uint32_t type, int ial)
1676 {
1677     while ( regitr_overlap(itr) )
1678     {
1679         gf_utr_t *utr = regitr_payload(itr, gf_utr_t*);
1680         tscript_t *tr = utr->tr;
1681         if ( tr->id != trid ) continue;
1682         csq_t csq;
1683         memset(&csq, 0, sizeof(csq_t));
1684         csq.pos          = rec->pos;
1685         csq.type.type    = (utr->which==prime5 ? CSQ_UTR5 : CSQ_UTR3) | type;
1686         csq.type.biotype = tr->type;
1687         csq.type.strand  = tr->strand;
1688         csq.type.trid    = tr->id;
1689         csq.type.vcf_ial = ial;
1690         csq.type.gene    = tr->gene->name;
1691         csq_stage(args, &csq, rec);
1692         return csq.type.type;
1693     }
1694     return 0;
1695 }
csq_stage_splice(args_t * args,bcf1_t * rec,tscript_t * tr,uint32_t type,int ial)1696 static inline void csq_stage_splice(args_t *args, bcf1_t *rec, tscript_t *tr, uint32_t type, int ial)
1697 {
1698 #if XDBG
1699 fprintf(stderr,"csq_stage_splice %d: type=%d\n",rec->pos+1,type);
1700 #endif
1701     if ( !type ) return;
1702     csq_t csq;
1703     memset(&csq, 0, sizeof(csq_t));
1704     csq.pos          = rec->pos;
1705     csq.type.type    = type;
1706     csq.type.biotype = tr->type;
1707     csq.type.strand  = tr->strand;
1708     csq.type.trid    = tr->id;
1709     csq.type.vcf_ial = ial;
1710     csq.type.gene    = tr->gene->name;
1711     csq_stage(args, &csq, rec);
1712 }
splice_csq_ins(args_t * args,splice_t * splice,uint32_t ex_beg,uint32_t ex_end)1713 static inline int splice_csq_ins(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
1714 {
1715     // coordinates that matter for consequences, eg AC>ACG trimmed to C>CG, 1bp
1716     // before and after the inserted bases
1717     if ( splice->tbeg || splice->vcf.ref[0]!=splice->vcf.alt[0] )
1718     {
1719         splice->ref_beg = splice->vcf.pos + splice->tbeg - 1;
1720         splice->ref_end = splice->vcf.pos + splice->vcf.rlen - splice->tend;
1721     }
1722     else
1723     {
1724         if ( splice->tend ) splice->tend--;
1725         splice->ref_beg = splice->vcf.pos;
1726         splice->ref_end = splice->vcf.pos + splice->vcf.rlen - splice->tend;
1727     }
1728 #if XDBG
1729 fprintf(stderr,"ins: %s>%s .. ex=%d,%d  beg,end=%d,%d  tbeg,tend=%d,%d  check_utr=%d start,stop,beg,end=%d,%d,%d,%d\n", splice->vcf.ref,splice->vcf.alt,ex_beg,ex_end,splice->ref_beg,splice->ref_end,splice->tbeg,splice->tend,splice->check_utr,splice->check_start,splice->check_stop,splice->check_region_beg,splice->check_region_end);
1730 #endif
1731 
1732     int ret;
1733     if ( splice->ref_beg >= ex_end )   // fully outside, beyond the exon
1734     {
1735         if ( splice->check_utr )
1736         {
1737             regitr_t *itr = regitr_init(NULL);
1738             const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
1739             if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg+1,splice->ref_beg+1, itr) )     // adjacent utr
1740             {
1741                 ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
1742                 if ( ret!=0 )
1743                 {
1744                     regitr_destroy(itr);
1745                     return SPLICE_OUTSIDE; // overlaps utr
1746                 }
1747             }
1748             regitr_destroy(itr);
1749         }
1750         if ( !splice->check_region_end ) return SPLICE_OUTSIDE;
1751         char *ref = NULL, *alt = NULL;
1752         if ( splice->set_refalt )   // seq identity is checked only when tr->ref is available
1753         {
1754             splice_build_hap(splice, ex_end+1, N_SPLICE_REGION_INTRON);
1755             ref = splice->kref.s, alt = splice->kalt.s;
1756         }
1757         if ( splice->ref_beg < ex_end + N_SPLICE_REGION_INTRON && splice->ref_end > ex_end + N_SPLICE_DONOR )
1758         {
1759             splice->csq |= CSQ_SPLICE_REGION;
1760             if ( ref && !strncmp(ref,alt,N_SPLICE_REGION_INTRON) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
1761         }
1762         if ( splice->ref_beg < ex_end + N_SPLICE_DONOR )
1763         {
1764             if ( splice->check_donor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_DONOR;
1765             if ( splice->check_acceptor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
1766             if ( ref && !strncmp(ref,alt,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
1767         }
1768         csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial);
1769         return SPLICE_OUTSIDE;
1770     }
1771     if ( splice->ref_end < ex_beg || (splice->ref_end == ex_beg && !splice->check_region_beg) )    // fully outside, before the exon
1772     {
1773         if ( splice->check_utr )
1774         {
1775             regitr_t *itr = regitr_init(NULL);
1776             const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
1777             if ( regidx_overlap(args->idx_utr,chr,splice->ref_end-1,splice->ref_end-1, itr) )     // adjacent utr
1778             {
1779                 ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
1780                 if ( ret!=0 )
1781                 {
1782                     regitr_destroy(itr);
1783                     return SPLICE_OUTSIDE; // overlaps utr
1784                 }
1785             }
1786             regitr_destroy(itr);
1787         }
1788         if ( !splice->check_region_beg ) return SPLICE_OUTSIDE;
1789         char *ref = NULL, *alt = NULL;
1790         if ( splice->set_refalt )   // seq identity is checked only when tr->ref is available
1791         {
1792             splice_build_hap(splice, ex_beg - N_SPLICE_REGION_INTRON, N_SPLICE_REGION_INTRON);
1793             ref = splice->kref.s, alt = splice->kalt.s;
1794         }
1795         if ( splice->ref_end > ex_beg - N_SPLICE_REGION_INTRON && splice->ref_beg < ex_beg - N_SPLICE_DONOR )
1796         {
1797             splice->csq |= CSQ_SPLICE_REGION;
1798             if ( ref && !strncmp(ref,alt,N_SPLICE_REGION_INTRON) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
1799         }
1800         if ( splice->ref_end > ex_beg - N_SPLICE_DONOR )
1801         {
1802             if ( splice->check_donor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_DONOR;
1803             if ( splice->check_acceptor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
1804             if ( ref && !strncmp(ref+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,alt+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
1805         }
1806         csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial);
1807         return SPLICE_OUTSIDE;
1808     }
1809     // overlaps the exon or inside the exon
1810     // possible todo: find better alignment for frameshifting variants?
1811     if ( splice->ref_beg <= ex_beg + 2 )    // in the first 3bp
1812     {
1813         if ( splice->check_region_beg ) splice->csq |= CSQ_SPLICE_REGION;
1814         if ( splice->tr->strand==STRAND_FWD ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
1815         else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
1816     }
1817     if ( splice->ref_end > ex_end - 2 )
1818     {
1819         if ( splice->check_region_end ) splice->csq |= CSQ_SPLICE_REGION;
1820         if ( splice->tr->strand==STRAND_REV ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
1821         else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
1822     }
1823     if ( splice->set_refalt )
1824     {
1825         // Make sure the variant will not end up left aligned to avoid overlapping vcf records
1826         //      splice_build_hap(splice, splice->ref_beg, splice->vcf.alen - splice->tend - splice->tbeg + 1);
1827         //      splice->vcf.rlen -= splice->tbeg + splice->tend - 1;
1828         //      if ( splice->kref.l > splice->vcf.rlen ) { splice->kref.l = splice->vcf.rlen;  splice->kref.s[splice->kref.l] = 0; }
1829         if ( splice->ref_beg < splice->vcf.pos )    // this must have been caused by too much trimming from right
1830         {
1831             int dlen = splice->vcf.pos - splice->ref_beg;
1832             assert( dlen==1 );
1833             splice->tbeg += dlen;
1834             if ( splice->tbeg + splice->tend == splice->vcf.rlen ) splice->tend -= dlen;
1835             splice->ref_beg = splice->vcf.pos;
1836         }
1837         if ( splice->ref_end==ex_beg ) splice->tend--;  // prevent zero-length ref allele
1838         splice_build_hap(splice, splice->ref_beg, splice->vcf.alen - splice->tend - splice->tbeg + 1);
1839         splice->vcf.rlen -= splice->tbeg + splice->tend - 1;
1840         if ( splice->kref.l > splice->vcf.rlen ) { splice->kref.l = splice->vcf.rlen;  splice->kref.s[splice->kref.l] = 0; }
1841     }
1842     csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial);
1843     return SPLICE_INSIDE;
1844 }
1845 
shifted_del_synonymous(args_t * args,splice_t * splice,uint32_t ex_beg,uint32_t ex_end)1846 int shifted_del_synonymous(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
1847 {
1848     static int small_ref_padding_warned = 0;
1849     tscript_t *tr = splice->tr;
1850 
1851     // We know the VCF record overlaps the exon, but does it overlap the start codon?
1852     if ( tr->strand==STRAND_REV && splice->vcf.pos + splice->vcf.rlen + 2 <= ex_end ) return 0;
1853     if ( tr->strand==STRAND_FWD && splice->vcf.pos >= ex_beg + 3 ) return 0;
1854 
1855 #if XDBG
1856     fprintf(stderr,"shifted_del_synonymous: %d-%d  %s\n",ex_beg,ex_end, tr->strand==STRAND_FWD?"fwd":"rev");
1857     fprintf(stderr,"   %d  ..  %s > %s\n",splice->vcf.pos+1,splice->vcf.ref,splice->vcf.alt);
1858 #endif
1859 
1860     // is there enough ref sequence for the extension? All coordinates are 0-based
1861     int ref_len = strlen(splice->vcf.ref);
1862     int alt_len = strlen(splice->vcf.alt);
1863     assert( ref_len > alt_len );
1864     int ndel = ref_len - alt_len;
1865 
1866     if ( tr->strand==STRAND_REV )
1867     {
1868         int32_t vcf_ref_end = splice->vcf.pos + ref_len - 1;  // end pos of the VCF REF allele
1869         int32_t tr_ref_end  = splice->tr->end + N_REF_PAD;    // the end pos of accessible cached ref seq
1870         if ( vcf_ref_end + ndel > tr_ref_end )
1871         {
1872             if ( !small_ref_padding_warned )
1873             {
1874                 fprintf(stderr,"Warning: Could not verify synonymous start/stop at %s:%d due to small N_REF_PAD. (Improve me?)\n",bcf_seqname(args->hdr,splice->vcf.rec),splice->vcf.pos+1);
1875                 small_ref_padding_warned = 1;
1876             }
1877             return 0;
1878         }
1879 
1880         char *ptr_vcf = splice->vcf.ref + alt_len;                         // the first deleted base in the VCF REF allele
1881         char *ptr_ref = splice->tr->ref + N_REF_PAD + (vcf_ref_end + 1 - splice->tr->beg);  // the first ref base after the ndel bases deleted
1882 #if XDBG
1883         fprintf(stderr,"vcf: %s\nref: %s\n",ptr_vcf,ptr_ref);
1884 #endif
1885         int i = 0;
1886         while ( ptr_vcf[i] && ptr_vcf[i]==ptr_ref[i] ) i++;
1887         if ( ptr_vcf[i] ) return 0;       // the deleted sequence cannot be replaced
1888     }
1889     else
1890     {
1891         // STRAND_FWD
1892         int32_t vcf_block_beg = splice->vcf.pos + ref_len - 2*ndel;        // the position of the first base of the ref block that could potentially replace the deletion
1893         if ( vcf_block_beg < 0 ) return 0;
1894 
1895 #if XDBG
1896         fprintf(stderr,"vcf_block_beg: %d\n",vcf_block_beg+1);
1897 #endif
1898 
1899         if ( N_REF_PAD + vcf_block_beg < ex_beg )
1900         {
1901             if ( !small_ref_padding_warned )
1902             {
1903                 fprintf(stderr,"Warning: Could not verify synonymous start/stop at %s:%d due to small N_REF_PAD. (Improve me?)\n",bcf_seqname(args->hdr,splice->vcf.rec),splice->vcf.pos+1);
1904                 small_ref_padding_warned = 1;
1905             }
1906             return 0;
1907         }
1908 
1909         char *ptr_vcf = splice->vcf.ref + alt_len;                                      // the first deleted base in the VCF REF allele
1910         char *ptr_ref = splice->tr->ref + N_REF_PAD + vcf_block_beg - splice->tr->beg;  // the replacement ref block
1911 #if XDBG
1912         fprintf(stderr,"vcf: %s\nref: %s\n",ptr_vcf,ptr_ref);
1913 #endif
1914 
1915         int i = 0;
1916         while ( ptr_vcf[i] && ptr_vcf[i]==ptr_ref[i] ) i++;
1917         if ( ptr_vcf[i] ) return 0;       // the deleted sequence cannot be replaced
1918     }
1919 
1920     return 1;
1921 }
1922 
splice_csq_del(args_t * args,splice_t * splice,uint32_t ex_beg,uint32_t ex_end)1923 static inline int splice_csq_del(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
1924 {
1925     if ( splice->check_start )
1926     {
1927         // check for synonymous start
1928         //      test/csq/ENST00000375992/incorrect-synon-del-not-start-lost.txt
1929         //      test/csq/ENST00000368801.2/start-lost.txt
1930         //      test/csq/ENST00000318249.2/synonymous-start-lost.txt
1931         int is_synonymous = shifted_del_synonymous(args, splice, ex_beg, ex_end);
1932         if ( is_synonymous )
1933         {
1934             splice->csq |= CSQ_START_RETAINED;
1935             return SPLICE_OVERLAP;
1936         }
1937     }
1938 
1939     // coordinates that matter for consequences, eg AC>ACG trimmed to C>CG
1940     splice->ref_beg = splice->vcf.pos + splice->tbeg - 1;                       // 1b before the deleted base
1941     splice->ref_end = splice->vcf.pos + splice->vcf.rlen - splice->tend - 1;    // the last deleted base
1942 
1943 #if XDBG
1944 fprintf(stderr,"splice_csq_del: %s>%s .. ex=%d,%d  beg,end=%d,%d  tbeg,tend=%d,%d  check_utr=%d start,stop,beg,end=%d,%d,%d,%d\n", splice->vcf.ref,splice->vcf.alt,ex_beg,ex_end,splice->ref_beg,splice->ref_end,splice->tbeg,splice->tend,splice->check_utr,splice->check_start,splice->check_stop,splice->check_region_beg,splice->check_region_end);
1945 #endif
1946 
1947     if ( splice->ref_beg + 1 < ex_beg )     // the part before the exon; ref_beg is off by -1
1948     {
1949         if ( splice->check_region_beg )
1950         {
1951             int csq = 0;
1952             if ( splice->check_utr )
1953             {
1954                 regitr_t *itr = regitr_init(NULL);
1955                 const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
1956                 if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) )     // adjacent utr
1957                     csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
1958                 regitr_destroy(itr);
1959             }
1960             if ( !csq )
1961             {
1962                 char *ref = NULL, *alt = NULL;
1963                 if ( splice->set_refalt )   // seq identity is checked only when tr->ref is available
1964                 {
1965                     // filling from the left does not work for ENST00000341065/frame3.vcf
1966                     //    CAG.GTGGCCAG      CAG.GTGGCCAG
1967                     //    CA-.--GGCCAG  vs  CAG.---GCCAG
1968                     //  splice_build_hap(splice, ex_beg-1, -N_SPLICE_REGION_INTRON);
1969                     //
1970                     // filling from the right:
1971                     splice_build_hap(splice, ex_beg - N_SPLICE_REGION_INTRON, N_SPLICE_REGION_INTRON);
1972                     ref = splice->kref.s, alt = splice->kalt.s;
1973                 }
1974                 if ( splice->ref_end >= ex_beg - N_SPLICE_REGION_INTRON && splice->ref_beg < ex_beg - N_SPLICE_DONOR )
1975                 {
1976                     splice->csq |= CSQ_SPLICE_REGION;
1977                     if ( ref && alt && !strncmp(ref,alt,N_SPLICE_REGION_INTRON) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
1978                 }
1979                 if ( splice->ref_end >= ex_beg - N_SPLICE_DONOR )
1980                 {
1981                     if ( splice->check_donor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_DONOR;
1982                     if ( splice->check_acceptor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
1983                     if ( ref && alt && !strncmp(ref+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,alt+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
1984                 }
1985             }
1986         }
1987         if ( splice->ref_end >= ex_beg )
1988         {
1989             splice->tbeg = splice->ref_beg - splice->vcf.pos + 1;
1990             splice->ref_beg = ex_beg - 1;
1991             if ( splice->tbeg + splice->tend == splice->vcf.alen )
1992             {
1993                 // the deletion overlaps ex_beg and cannot be easily realigned to the right
1994                 if ( !splice->tend )
1995                 {
1996                     splice->csq |= CSQ_CODING_SEQUENCE;
1997                     return SPLICE_OVERLAP;
1998                 }
1999                 splice->tend--;
2000             }
2001         }
2002     }
2003     if ( ex_end < splice->ref_end )     // the part after the exon
2004     {
2005         if ( splice->check_region_end )
2006         {
2007             int csq = 0;
2008             if ( splice->check_utr )
2009             {
2010                 regitr_t *itr = regitr_init(NULL);
2011                 const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
2012                 if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) )     // adjacent utr
2013                     csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
2014                 regitr_destroy(itr);
2015             }
2016             if ( !csq )
2017             {
2018                 char *ref = NULL, *alt = NULL;
2019                 if ( splice->set_refalt )   // seq identity is checked only when tr->ref is available
2020                 {
2021                     splice_build_hap(splice, ex_end+1, N_SPLICE_REGION_INTRON);  // ref,alt positioned at the first intron base
2022                     ref = splice->kref.s, alt = splice->kalt.s;
2023                 }
2024                 if ( splice->ref_beg < ex_end + N_SPLICE_REGION_INTRON && splice->ref_end > ex_end + N_SPLICE_DONOR )
2025                 {
2026                     splice->csq |= CSQ_SPLICE_REGION;
2027                     if ( ref && alt && !strncmp(ref,alt,N_SPLICE_REGION_INTRON) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
2028                 }
2029                 if ( splice->ref_beg < ex_end + N_SPLICE_DONOR )
2030                 {
2031                     if ( splice->check_donor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_DONOR;
2032                     if ( splice->check_acceptor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
2033                     if ( ref && alt && !strncmp(ref+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,alt+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
2034                 }
2035             }
2036         }
2037         if ( splice->ref_beg < ex_end )
2038         {
2039             splice->tend = splice->vcf.rlen - (splice->ref_end - splice->vcf.pos + 1);
2040             splice->ref_end = ex_end;
2041         }
2042     }
2043     if ( splice->ref_end < ex_beg || splice->ref_beg >= ex_end )
2044     {
2045         csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial);
2046         return SPLICE_OUTSIDE;
2047     }
2048     if ( splice->ref_beg < ex_beg + 2 ) // ref_beg is off by -1
2049     {
2050         if ( splice->check_region_beg ) splice->csq |= CSQ_SPLICE_REGION;
2051         if ( splice->tr->strand==STRAND_FWD ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
2052         else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
2053     }
2054     if ( splice->ref_end > ex_end - 3 )
2055     {
2056         if ( splice->check_region_end ) splice->csq |= CSQ_SPLICE_REGION;
2057         if ( splice->tr->strand==STRAND_REV ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
2058         else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
2059     }
2060     if ( splice->set_refalt )
2061     {
2062         if ( splice->tbeg>0 ) splice->tbeg--;  //why is this?
2063         if ( splice->vcf.rlen > splice->tbeg + splice->tend && splice->vcf.alen > splice->tbeg + splice->tend )
2064         {
2065             splice->vcf.rlen -= splice->tbeg + splice->tend;
2066             splice->vcf.alen -= splice->tbeg + splice->tend;
2067         }
2068         splice->kref.l = 0; kputsn(splice->vcf.ref + splice->tbeg, splice->vcf.rlen, &splice->kref);
2069         splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.alen, &splice->kalt);
2070         if ( (splice->ref_beg+1 < ex_beg && splice->ref_end >= ex_beg) || (splice->ref_beg+1 < ex_end && splice->ref_end >= ex_end) ) // ouch, ugly ENST00000409523/long-overlapping-del.vcf
2071         {
2072             splice->csq |= (splice->ref_end - splice->ref_beg)%3 ? CSQ_FRAMESHIFT_VARIANT : CSQ_INFRAME_DELETION;
2073             return SPLICE_OVERLAP;
2074         }
2075     }
2076     csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial);
2077     return SPLICE_INSIDE;
2078 }
2079 
splice_csq_mnp(args_t * args,splice_t * splice,uint32_t ex_beg,uint32_t ex_end)2080 static inline int splice_csq_mnp(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
2081 {
2082     // not a real variant, can be ignored: eg ACGT>ACGT
2083     if ( splice->tbeg + splice->tend == splice->vcf.rlen ) return SPLICE_VAR_REF;
2084 
2085     splice->ref_beg = splice->vcf.pos + splice->tbeg;
2086     splice->ref_end = splice->vcf.pos + splice->vcf.rlen - splice->tend - 1;
2087 
2088 #if XDBG
2089 fprintf(stderr,"mnp: %s>%s .. ex=%d,%d  beg,end=%d,%d  tbeg,tend=%d,%d  check_utr=%d start,stop,beg,end=%d,%d,%d,%d\n", splice->vcf.ref,splice->vcf.alt,ex_beg,ex_end,splice->ref_beg,splice->ref_end,splice->tbeg,splice->tend,splice->check_utr,splice->check_start,splice->check_stop,splice->check_region_beg,splice->check_region_end);
2090 #endif
2091 
2092     if ( splice->ref_beg < ex_beg )     // the part before the exon
2093     {
2094         if ( splice->check_region_beg )
2095         {
2096             int csq = 0;
2097             if ( splice->check_utr )
2098             {
2099                 regitr_t *itr = regitr_init(NULL);
2100                 const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
2101                 if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) )     // adjacent utr
2102                     csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
2103                 regitr_destroy(itr);
2104             }
2105             if ( !csq )
2106             {
2107                 if ( splice->ref_end >= ex_beg - N_SPLICE_REGION_INTRON && splice->ref_beg < ex_beg - N_SPLICE_DONOR )
2108                     splice->csq |= CSQ_SPLICE_REGION;
2109                 if ( splice->ref_end >= ex_beg - N_SPLICE_DONOR )
2110                 {
2111                     if ( splice->check_donor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_DONOR;
2112                     if ( splice->check_acceptor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
2113                 }
2114             }
2115         }
2116         if ( splice->ref_end >= ex_beg )
2117         {
2118             splice->tbeg = splice->ref_beg - splice->vcf.pos;
2119             splice->ref_beg = ex_beg;
2120         }
2121     }
2122     if ( ex_end < splice->ref_end )     // the part after the exon
2123     {
2124         if ( splice->check_region_end )
2125         {
2126             int csq = 0;
2127             if ( splice->check_utr )
2128             {
2129                 regitr_t *itr = regitr_init(NULL);
2130                 const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
2131                 if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) )     // adjacent utr
2132                     csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
2133                 regitr_destroy(itr);
2134             }
2135             if ( !csq )
2136             {
2137                 if ( splice->ref_beg <= ex_end + N_SPLICE_REGION_INTRON && splice->ref_end > ex_end + N_SPLICE_DONOR )
2138                     splice->csq |= CSQ_SPLICE_REGION;
2139                 if ( splice->ref_beg <= ex_end + N_SPLICE_DONOR )
2140                 {
2141                     if ( splice->check_donor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_DONOR;
2142                     if ( splice->check_acceptor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
2143                 }
2144             }
2145         }
2146         if ( splice->ref_beg <= ex_end )
2147         {
2148             splice->tend = splice->vcf.rlen - (splice->ref_end - splice->vcf.pos + 1);
2149             splice->ref_end = ex_end;
2150         }
2151     }
2152     if ( splice->ref_end < ex_beg || splice->ref_beg > ex_end )
2153     {
2154         csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial);
2155         return SPLICE_OUTSIDE;
2156     }
2157 
2158     if ( splice->ref_beg < ex_beg + 3 )
2159     {
2160         if ( splice->check_region_beg ) splice->csq |= CSQ_SPLICE_REGION;
2161         if ( splice->tr->strand==STRAND_FWD ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
2162         else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
2163     }
2164     if ( splice->ref_end > ex_end - 3 )
2165     {
2166         if ( splice->check_region_end ) splice->csq |= CSQ_SPLICE_REGION;
2167         if ( splice->tr->strand==STRAND_REV ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
2168         else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
2169     }
2170     if ( splice->set_refalt )
2171     {
2172         splice->vcf.rlen -= splice->tbeg + splice->tend;
2173         splice->kref.l = 0; kputsn(splice->vcf.ref + splice->tbeg, splice->vcf.rlen, &splice->kref);
2174         splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.rlen, &splice->kalt);
2175     }
2176     csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial);
2177     return SPLICE_INSIDE;
2178 }
splice_csq(args_t * args,splice_t * splice,uint32_t ex_beg,uint32_t ex_end)2179 static inline int splice_csq(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
2180 {
2181     splice->vcf.alen = strlen(splice->vcf.alt);
2182 
2183     int rlen1 = splice->vcf.rlen - 1, alen1 = splice->vcf.alen - 1, i = 0;
2184     splice->tbeg = 0, splice->tend = 0;
2185 
2186     // trim from the right, then from the left
2187     while ( i<=rlen1 && i<=alen1 )
2188     {
2189         if ( splice->vcf.ref[rlen1-i] != splice->vcf.alt[alen1-i] ) break;
2190         i++;
2191     }
2192     splice->tend = i;
2193     rlen1 -= i, alen1 -= i, i = 0;
2194     while ( i<=rlen1 && i<=alen1 )
2195     {
2196         if ( splice->vcf.ref[i] != splice->vcf.alt[i] ) break;
2197         i++;
2198     }
2199     splice->tbeg = i;
2200 
2201     // The mnp, ins and del code was split into near-identical functions for clarity and debugging;
2202     // possible todo: generalize once stable
2203     if ( splice->vcf.rlen==splice->vcf.alen ) return splice_csq_mnp(args, splice, ex_beg, ex_end);
2204     if ( splice->vcf.rlen < splice->vcf.alen ) return splice_csq_ins(args, splice, ex_beg, ex_end);
2205     if ( splice->vcf.rlen > splice->vcf.alen ) return splice_csq_del(args, splice, ex_beg, ex_end);
2206 
2207     return 0;
2208 }
2209 
2210 
2211 // return value: 0 added, 1 overlapping variant, 2 silent discard (intronic,alt=ref)
hap_init(args_t * args,hap_node_t * parent,hap_node_t * child,gf_cds_t * cds,bcf1_t * rec,int ial)2212 int hap_init(args_t *args, hap_node_t *parent, hap_node_t *child, gf_cds_t *cds, bcf1_t *rec, int ial)
2213 {
2214     int i;
2215     kstring_t str = {0,0,0};
2216     tscript_t *tr = cds->tr;
2217     child->icds = cds->icds;     // index of cds in the tscript's list of exons
2218     child->vcf_ial = ial;
2219 
2220     splice_t splice;
2221     splice_init(&splice, rec);
2222     splice.tr = tr;
2223     splice.vcf.ial  = ial;
2224     splice.vcf.alt  = rec->d.allele[ial];
2225     splice.check_acceptor = splice.check_donor = splice.set_refalt = splice.check_utr = 1;
2226     if ( !(tr->trim & TRIM_5PRIME) )
2227     {
2228         if ( tr->strand==STRAND_FWD ) { if ( child->icds==0 ) splice.check_start = 1; }
2229         else { if ( child->icds==tr->ncds-1 ) splice.check_start = 1; }
2230     }
2231     if ( !(tr->trim & TRIM_3PRIME) )
2232     {
2233         if ( tr->strand==STRAND_FWD ) { if ( child->icds==tr->ncds-1 ) splice.check_stop = 1; }
2234         else { if ( child->icds==0 ) splice.check_stop = 1; }
2235     }
2236     if ( splice.check_start )   // do not check starts in incomplete CDS, defined as not starting with M
2237     {
2238         if ( tr->strand==STRAND_FWD ) { if ( dna2aa(tr->ref+N_REF_PAD+cds->beg-tr->beg) != 'M' ) splice.check_start = 0; }
2239         else { if ( cdna2aa(tr->ref+N_REF_PAD+cds->beg-tr->beg+cds->len-3) != 'M' ) splice.check_start = 0; }
2240     }
2241     if ( child->icds!=0 ) splice.check_region_beg = 1;
2242     if ( child->icds!=tr->ncds-1 ) splice.check_region_end = 1;
2243 
2244 #if XDBG
2245 fprintf(stderr,"\nhap_init: %d [%s][%s]   check start:%d,stop:%d\n",splice.vcf.pos+1,splice.vcf.ref,splice.vcf.alt,splice.check_start,splice.check_stop);
2246 #endif
2247     int ret = splice_csq(args, &splice, cds->beg, cds->beg + cds->len - 1);
2248 #if XDBG
2249 fprintf(stderr,"cds splice_csq: %d [%s][%s] .. beg,end=%d %d, ret=%d, csq=%d\n\n",splice.vcf.pos+1,splice.kref.s,splice.kalt.s,splice.ref_beg+1,splice.ref_end+1,ret,splice.csq);
2250 #endif
2251 
2252     if ( ret==SPLICE_VAR_REF ) return 2;  // not a variant, eg REF=CA ALT=CA
2253     if ( ret==SPLICE_OUTSIDE || ret==SPLICE_OVERLAP || splice.csq==CSQ_START_LOST )  // not a coding csq
2254     {
2255         free(splice.kref.s);
2256         free(splice.kalt.s);
2257 
2258         if ( !splice.csq ) return 2;        // fully intronic, no csq
2259 
2260         // splice_region/acceptor/donor
2261         child->seq  = NULL;
2262         child->sbeg = 0;
2263         child->rbeg = rec->pos;
2264         child->rlen = 0;
2265         child->dlen = 0;
2266         kputs(rec->d.allele[0],&str);
2267         kputc('>',&str);
2268         kputs(rec->d.allele[ial],&str);
2269         child->var  = str.s;
2270         child->type = HAP_SSS;
2271         child->csq  = splice.csq;
2272         child->rec  = rec;
2273         return 0;
2274     }
2275     if ( splice.csq & CSQ_SYNONYMOUS_VARIANT ) splice.csq &= ~CSQ_SYNONYMOUS_VARIANT;   // synonymous&splice,frame could become synonymous&frame,splice
2276 
2277     int dbeg = 0;
2278     if ( splice.ref_beg < cds->beg )
2279     {
2280         // The vcf record overlaps the exon boundary, but the variant itself
2281         // should fit inside since we are here. This will need more work.
2282         // #1475227917
2283         dbeg = cds->beg - splice.ref_beg;
2284         splice.kref.l -= dbeg;
2285         splice.ref_beg = cds->beg;
2286         assert( dbeg <= splice.kalt.l );
2287     }
2288 
2289     assert( parent->type!=HAP_SSS );
2290     if ( parent->type==HAP_CDS )
2291     {
2292         i = parent->icds;
2293         if ( i!=cds->icds )
2294         {
2295             // the variant is on a new exon, finish up the previous
2296             int len = tr->cds[i]->len - parent->rbeg - parent->rlen + tr->cds[i]->beg;
2297             if ( len > 0 )
2298                 kputsn_(tr->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str);
2299         }
2300 
2301         // append any skipped non-variant exons
2302         while ( ++i < cds->icds )
2303             kputsn_(tr->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len, &str);
2304 
2305         if ( parent->icds==child->icds )
2306         {
2307             int len = splice.ref_beg - parent->rbeg - parent->rlen;
2308             if ( len < 0 )   // overlapping variants
2309             {
2310                 free(str.s);
2311                 free(splice.kref.s);
2312                 free(splice.kalt.s);
2313                 return 1;
2314             }
2315             kputsn_(tr->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str);
2316         }
2317         else
2318             kputsn_(tr->ref + N_REF_PAD + cds->beg - tr->beg, splice.ref_beg - cds->beg, &str);
2319     }
2320     kputs(splice.kalt.s + dbeg, &str);
2321 
2322     child->seq  = str.s;
2323     child->sbeg = cds->pos + (splice.ref_beg - cds->beg);
2324     child->rbeg = splice.ref_beg;
2325     child->rlen = splice.kref.l;
2326     child->type = HAP_CDS;
2327     child->prev = parent;
2328     child->rec  = rec;
2329     child->csq  = splice.csq;
2330 
2331     // set vlen and the "ref>alt" string
2332     {
2333         int rlen = strlen(rec->d.allele[0]);
2334         int alen = strlen(rec->d.allele[ial]);
2335         child->dlen = alen - rlen;
2336         child->var  = (char*) malloc(rlen+alen+2);
2337         memcpy(child->var,rec->d.allele[0],rlen);
2338         child->var[rlen] = '>';
2339         memcpy(child->var+rlen+1,rec->d.allele[ial],alen);
2340         child->var[rlen+alen+1] = 0;
2341     }
2342 
2343     // yuck, the whole CDS is modified/deleted, not ready for this, todo.
2344     if ( child->rbeg + child->rlen > cds->beg + cds->len )
2345     {
2346         child->type = HAP_SSS;
2347         if ( !child->csq ) child->csq |= CSQ_CODING_SEQUENCE;  // hack, specifically for ENST00000390520/deletion-overlap.vcf
2348     }
2349 
2350 
2351     free(splice.kref.s);
2352     free(splice.kalt.s);
2353     return 0;
2354 }
hap_destroy(hap_node_t * hap)2355 void hap_destroy(hap_node_t *hap)
2356 {
2357     int i;
2358     for (i=0; i<hap->nchild; i++)
2359         if ( hap->child[i] ) hap_destroy(hap->child[i]);
2360     for (i=0; i<hap->mcsq_list; i++) free(hap->csq_list[i].type.vstr.s);
2361     free(hap->csq_list);
2362     free(hap->child);
2363     free(hap->cur_child);
2364     free(hap->seq);
2365     free(hap->var);
2366     free(hap);
2367 }
2368 
2369 
2370 /*
2371     ref:    spliced reference and its length (ref.l)
2372     seq:    part of the spliced query transcript on the reference strand to translate, its
2373                 length (seq.l) and the total length of the complete transcript (seq.m)
2374     sbeg:   seq offset within the spliced query transcript
2375     rbeg:   seq offset within ref, 0-based
2376     rend:   last base of seq within ref, plus one. If seq does not contain indels, it is rend=rbeg+seq->l
2377     strand: coding strand - 0:rev, 1:fwd
2378     tseq:   translated sequence (aa)
2379     fill:   frameshift, fill until the end (strand=fwd) or from the start (strand=rev)
2380  */
cds_translate(kstring_t * _ref,kstring_t * _seq,uint32_t sbeg,uint32_t rbeg,uint32_t rend,int strand,kstring_t * tseq,int fill)2381 void cds_translate(kstring_t *_ref, kstring_t *_seq, uint32_t sbeg, uint32_t rbeg, uint32_t rend, int strand, kstring_t *tseq, int fill)
2382 {
2383 #if XDBG
2384 fprintf(stderr,"\ntranslate: %d %d %d  fill=%d  seq.l=%d\n",sbeg,rbeg,rend,fill,(int)_seq->l);
2385 #endif
2386     char tmp[3], *codon, *end;
2387     int i, len, npad;
2388 
2389     kstring_t ref = *_ref;
2390     kstring_t seq = *_seq;
2391 
2392     tseq->l = 0;
2393     if ( !seq.l )
2394     {
2395         kputc('?', tseq);
2396         return;
2397     }
2398 
2399 #define DBG 0
2400 #if DBG
2401  fprintf(stderr,"translate: sbeg,rbeg,rend=%d %d %d  fill=%d  seq.l=%d\n",sbeg,rbeg,rend,fill,(int)_seq->l);
2402  fprintf(stderr,"    ref: l=%d %s\n", (int)ref.l,ref.s);
2403  fprintf(stderr,"    seq: l=%d m=%d ", (int)seq.l,(int)seq.m);
2404  for (i=0; i<seq.l; i++) fprintf(stderr,"%c",seq.s[i]); fprintf(stderr,"\n");
2405  fprintf(stderr,"    sbeg,rbeg,rend: %d,%d,%d\n", sbeg,rbeg,rend);
2406  fprintf(stderr,"    strand,fill: %d,%d\n", strand,fill);
2407 #endif
2408 
2409     if ( strand==STRAND_FWD )
2410     {
2411         // left padding
2412         npad = sbeg % 3;
2413 #if DBG>1
2414         fprintf(stderr,"    npad: %d\n",npad);
2415 #endif
2416         assert( npad<=rbeg );
2417 
2418         for (i=0; i<npad; i++)
2419             tmp[i] = ref.s[rbeg+i-npad+N_REF_PAD];
2420         for (; i<3 && i-npad<seq.l; i++)
2421             tmp[i] = seq.s[i-npad];
2422         len = seq.l - i + npad;    // the remaining length of padded sseq
2423 #if DBG>1
2424         fprintf(stderr,"\t i=%d\n", i);
2425 #endif
2426         if ( i==3 )
2427         {
2428             kputc_(dna2aa(tmp), tseq);
2429 #if DBG>1
2430             fprintf(stderr,"[1]%c%c%c\n",tmp[0],tmp[1],tmp[2]);
2431 #endif
2432             codon = seq.s + 3 - npad;        // next codon
2433             end   = codon + len - 1 - (len % 3);    // last position of a valid codon
2434             while ( codon < end )
2435             {
2436                 kputc_(dna2aa(codon), tseq);
2437 #if DBG>1
2438                 fprintf(stderr,"[2]%c%c%c\n",codon[0],codon[1],codon[2]);
2439 #endif
2440                 codon += 3;
2441             }
2442             end = seq.s + seq.l - 1;
2443             for (i=0; codon+i<=end; i++) tmp[i] = codon[i];
2444         }
2445 
2446         // right padding
2447         codon = ref.s + rend + N_REF_PAD;
2448         if ( i>0 )
2449         {
2450 #if DBG>1
2451             if(i==1)fprintf(stderr,"[3]%c\n",tmp[0]);
2452             if(i==2)fprintf(stderr,"[3]%c%c\n",tmp[0],tmp[1]);
2453 #endif
2454             for (; i<3; i++)
2455             {
2456                 tmp[i] = *codon;
2457                 codon++;
2458             }
2459             kputc_(dna2aa(tmp), tseq);
2460 #if DBG>1
2461             fprintf(stderr,"[4]%c%c%c\n",tmp[0],tmp[1],tmp[2]);
2462 #endif
2463         }
2464         if ( fill!=0 )
2465         {
2466             end = ref.s + ref.l - N_REF_PAD;
2467             while ( codon+3 <= end )
2468             {
2469                 kputc_(dna2aa(codon), tseq);
2470 #if DBG>1
2471                 fprintf(stderr,"[5]%c%c%c\t%c\n",codon[0],codon[1],codon[2],dna2aa(codon));
2472 #endif
2473                 codon += 3;
2474             }
2475         }
2476     }
2477     else    // STRAND_REV
2478     {
2479         // right padding - number of bases to take from ref
2480         npad = (seq.m - (sbeg + seq.l)) % 3;
2481 #if DBG>1
2482         fprintf(stderr,"    npad: %d\n",npad);
2483 #endif
2484         if ( !(npad>=0 && sbeg+seq.l+npad<=seq.m) ) fprintf(stderr,"sbeg=%d  seq.l=%d seq.m=%d npad=%d\n",sbeg,(int)seq.l,(int)seq.m,npad);
2485         assert( npad>=0 && sbeg+seq.l+npad<=seq.m );  // todo: first codon on the rev strand
2486 
2487         if ( npad==2 )
2488         {
2489             tmp[1] = ref.s[rend+N_REF_PAD];
2490             tmp[2] = ref.s[rend+N_REF_PAD+1];
2491             i = 0;
2492         }
2493         else if ( npad==1 )
2494         {
2495             tmp[2] = ref.s[rend+N_REF_PAD];
2496             i = 1;
2497         }
2498         else
2499             i = 2;
2500 
2501         end = seq.s + seq.l;
2502         for (; i>=0 && end>seq.s; i--) tmp[i] = *(--end);
2503 #if DBG>1
2504         fprintf(stderr,"\t i=%d\n", i);
2505         if(i==1)fprintf(stderr,"[0]  %c\n",tmp[2]);
2506         if(i==0)fprintf(stderr,"[0] %c%c\n",tmp[1],tmp[2]);
2507 #endif
2508         if ( i==-1 )
2509         {
2510 #if DBG>1
2511             fprintf(stderr,"[1]%c%c%c\t%c\n",tmp[0],tmp[1],tmp[2], cdna2aa(tmp));
2512 #endif
2513             kputc_(cdna2aa(tmp), tseq);
2514             codon = end - 3;
2515             while ( codon >= seq.s )
2516             {
2517                 kputc_(cdna2aa(codon), tseq);
2518 #if DBG>1
2519                 fprintf(stderr,"[2]%c%c%c\t%c\n",codon[0],codon[1],codon[2], cdna2aa(codon));
2520 #endif
2521                 codon -= 3;
2522             }
2523             if ( seq.s-codon==2 )
2524             {
2525                 tmp[2] = seq.s[0];
2526                 i = 1;
2527             }
2528             else if ( seq.s-codon==1 )
2529             {
2530                 tmp[1] = seq.s[0];
2531                 tmp[2] = seq.s[1];
2532                 i = 0;
2533             }
2534             else
2535                 i = -1;
2536 #if DBG>1
2537             if(i==1)fprintf(stderr,"[3]   %c\n",tmp[2]);
2538             if(i==0)fprintf(stderr,"[3] %c%c\n",tmp[1],tmp[2]);
2539 #endif
2540         }
2541         // left padding
2542         end = ref.s + N_REF_PAD + rbeg;
2543         if ( i>=0 )
2544         {
2545             for (; i>=0 && end>=ref.s; i--) tmp[i] = *(--end);
2546             kputc_(cdna2aa(tmp), tseq);
2547 #if DBG>1
2548             fprintf(stderr,"[4]%c%c%c\t%c\n",tmp[0],tmp[1],tmp[2],cdna2aa(tmp));
2549 #endif
2550         }
2551         if ( fill!=0 )
2552         {
2553             codon = end - 3;
2554             while ( codon >= ref.s + N_REF_PAD )
2555             {
2556                 kputc_(cdna2aa(codon), tseq);
2557 #if DBG>1
2558                 fprintf(stderr,"[5]%c%c%c\t%c\n",codon[0],codon[1],codon[2],cdna2aa(codon));
2559 #endif
2560                 codon -= 3;
2561             }
2562         }
2563     }
2564     kputc_(0,tseq); tseq->l--;
2565 #if DBG
2566  fprintf(stderr,"    tseq: %s\n", tseq->s);
2567 #endif
2568 }
2569 
tscript_splice_ref(tscript_t * tr)2570 void tscript_splice_ref(tscript_t *tr)
2571 {
2572     int i, len = 0;
2573     for (i=0; i<tr->ncds; i++)
2574         len += tr->cds[i]->len;
2575 
2576     tr->nsref = len + 2*N_REF_PAD;
2577     tr->sref  = (char*) malloc(len + 1 + 2*N_REF_PAD);
2578     len = 0;
2579 
2580     memcpy(tr->sref, tr->ref + tr->cds[0]->beg - tr->beg, N_REF_PAD);
2581     len += N_REF_PAD;
2582 
2583     for (i=0; i<tr->ncds; i++)
2584     {
2585         memcpy(tr->sref + len, tr->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len);
2586         len += tr->cds[i]->len;
2587     }
2588     memcpy(tr->sref + len, tr->ref + N_REF_PAD + tr->cds[tr->ncds-1]->beg - tr->beg, N_REF_PAD);
2589     len += N_REF_PAD;
2590 
2591     tr->sref[len] = 0;
2592 }
2593 
2594 // returns: 0 if consequence was added, 1 if it already exists or could not be added
csq_push(args_t * args,csq_t * csq,bcf1_t * rec)2595 int csq_push(args_t *args, csq_t *csq, bcf1_t *rec)
2596 {
2597 #if XDBG
2598 fprintf(stderr,"csq_push: %d .. %d\n",rec->pos+1,csq->type.type);
2599 #endif
2600     khint_t k = kh_get(pos2vbuf, args->pos2vbuf, (int)csq->pos);
2601     vbuf_t *vbuf = (k == kh_end(args->pos2vbuf)) ? NULL : kh_val(args->pos2vbuf, k);
2602     if ( !vbuf ) error("This should not happen. %s:%d  %s\n",bcf_seqname(args->hdr,rec),csq->pos+1,csq->type.vstr.s);
2603 
2604     int i;
2605     for (i=0; i<vbuf->n; i++)
2606         if ( vbuf->vrec[i]->line==rec ) break;
2607     if ( i==vbuf->n ) error("This should not happen.. %s:%d  %s\n", bcf_seqname(args->hdr,rec),csq->pos+1,csq->type.vstr.s);
2608     vrec_t *vrec = vbuf->vrec[i];
2609 
2610     // if the variant overlaps donor/acceptor and also splice region, report only donor/acceptor
2611     if ( csq->type.type & CSQ_SPLICE_REGION && csq->type.type & (CSQ_SPLICE_DONOR|CSQ_SPLICE_ACCEPTOR) )
2612         csq->type.type &= ~CSQ_SPLICE_REGION;
2613 
2614     if ( csq->type.type & CSQ_PRINTED_UPSTREAM )
2615     {
2616         for (i=0; i<vrec->nvcsq; i++)
2617         {
2618             // Same as below, to avoid records like
2619             //      3630 .. @3632,stop_lost|AL627309.1|ENST00000423372|protein_coding|-
2620             //      3632 .. stop_lost|AL627309.1|ENST00000423372|protein_coding|-|260*>260G|3630T>A+3632A>C
2621             if ( csq->type.type&CSQ_START_STOP && vrec->vcsq[i].type&CSQ_START_STOP )
2622             {
2623                 vrec->vcsq[i] = csq->type;
2624                 goto exit_duplicate;
2625             }
2626             if ( !(vrec->vcsq[i].type & CSQ_PRINTED_UPSTREAM) ) continue;
2627             if ( csq->type.ref != vrec->vcsq[i].ref ) continue;
2628             goto exit_duplicate;
2629         }
2630     }
2631     else if ( csq->type.type & CSQ_COMPOUND )
2632     {
2633         for (i=0; i<vrec->nvcsq; i++)
2634         {
2635             if ( csq->type.trid != vrec->vcsq[i].trid && (csq->type.type|vrec->vcsq[i].type)&CSQ_PRN_TSCRIPT ) continue;
2636             if ( csq->type.biotype != vrec->vcsq[i].biotype ) continue;
2637             if ( csq->type.gene != vrec->vcsq[i].gene ) continue;
2638             if ( csq->type.vcf_ial != vrec->vcsq[i].vcf_ial ) continue;
2639             if ( (csq->type.type&CSQ_UPSTREAM_STOP)^(vrec->vcsq[i].type&CSQ_UPSTREAM_STOP) ) continue;  // both must or mustn't have upstream_stop
2640             if ( csq->type.vstr.s || vrec->vcsq[i].vstr.s )
2641             {
2642                 // This is a bit hacky, but we want a simpler and more predictable output. The splice_csq() function
2643                 // can trigger stop/start events based on indel overlap, then another stop/start event can be triggered
2644                 // from add_csq() or test_cds_local() based on sequence comparison, and on output we could find two
2645                 // consequences:
2646                 //      stop_lost|AL627309.1|ENST00000423372|protein_coding|-
2647                 //      stop_lost&inframe_insertion|AL627309.1|ENST00000423372|protein_coding|-|260*>260CL|3630T>TAAA
2648                 if ( !csq->type.vstr.s || !vrec->vcsq[i].vstr.s )
2649                 {
2650                     if ( csq->type.type&CSQ_START_STOP && vrec->vcsq[i].type&CSQ_START_STOP )
2651                     {
2652                         vrec->vcsq[i].type |= csq->type.type;
2653 
2654                         // remove stop_lost&synonymous if stop_retained set
2655                         if ( vrec->vcsq[i].type&CSQ_STOP_RETAINED )
2656                             vrec->vcsq[i].type &= ~(CSQ_STOP_LOST|CSQ_SYNONYMOUS_VARIANT);
2657 
2658                         if ( !vrec->vcsq[i].vstr.s ) vrec->vcsq[i].vstr = csq->type.vstr;
2659                         goto exit_duplicate;
2660                     }
2661                     continue;
2662                 }
2663                 if ( strcmp(csq->type.vstr.s,vrec->vcsq[i].vstr.s) ) continue;
2664             }
2665             vrec->vcsq[i].type |= csq->type.type;
2666             goto exit_duplicate;
2667         }
2668     }
2669     else
2670     {
2671         for (i=0; i<vrec->nvcsq; i++)
2672         {
2673             if ( csq->type.trid != vrec->vcsq[i].trid && (csq->type.type|vrec->vcsq[i].type)&CSQ_PRN_TSCRIPT) continue;
2674             if ( csq->type.biotype != vrec->vcsq[i].biotype ) continue;
2675             if ( !(vrec->vcsq[i].type & CSQ_COMPOUND) )
2676             {
2677                 vrec->vcsq[i].type |= csq->type.type;
2678                 goto exit_duplicate;
2679             }
2680             if ( vrec->vcsq[i].type==(vrec->vcsq[i].type|csq->type.type) ) goto exit_duplicate;
2681         }
2682     }
2683     // no such csq yet in this vcf record
2684     csq->vrec = vrec;
2685     csq->idx  = vrec->nvcsq++;
2686     hts_expand0(vcsq_t, vrec->nvcsq, vrec->mvcsq, vrec->vcsq);
2687     vrec->vcsq[i] = csq->type;
2688     return 0;
2689 
2690 exit_duplicate:
2691     csq->vrec = vrec;
2692     csq->idx  = i;
2693     return 1;
2694 }
2695 
2696 //  soff .. position of the variant within the trimmed query transcript
2697 //  sbeg .. position of the variant within the query transcript
2698 //  rbeg .. position on the reference transcript (if there are no indels, then rbeg=send)
2699 //  rpos .. VCF position
2700 #define node2soff(i) (hap->stack[i].slen - (hap->stack[i].node->rlen + hap->stack[i].node->dlen))
2701 #define node2sbeg(i) (hap->sbeg + node2soff(i))
2702 #define node2send(i) (hap->sbeg + hap->stack[i].slen)
2703 #define node2rbeg(i) (hap->stack[i].node->sbeg)
2704 #define node2rend(i) (hap->stack[i].node->sbeg + hap->stack[i].node->rlen)
2705 #define node2rpos(i) (hap->stack[i].node->rec->pos)
2706 
kput_vcsq(args_t * args,vcsq_t * csq,kstring_t * str)2707 void kput_vcsq(args_t *args, vcsq_t *csq, kstring_t *str)
2708 {
2709     // Remove start/stop from incomplete CDS, but only if there is another
2710     // consequence as something must be reported
2711     if ( csq->type & CSQ_INCOMPLETE_CDS && (csq->type & ~(CSQ_START_STOP|CSQ_INCOMPLETE_CDS|CSQ_UPSTREAM_STOP)) ) csq->type &= ~(CSQ_START_STOP|CSQ_INCOMPLETE_CDS);
2712 
2713     // Remove missense from start/stops
2714     if ( csq->type & CSQ_START_STOP && csq->type & CSQ_MISSENSE_VARIANT ) csq->type &= ~CSQ_MISSENSE_VARIANT;
2715 
2716     if ( csq->type & CSQ_PRINTED_UPSTREAM && csq->ref )
2717     {
2718         kputc_('@',str);
2719         kputw(csq->ref->pos+1, str);
2720         return;
2721     }
2722     if ( csq->type & CSQ_UPSTREAM_STOP )
2723         kputc_('*',str);
2724 
2725     int i, n = sizeof(csq_strings)/sizeof(char*);
2726     for (i=1; i<n; i++)
2727         if ( csq_strings[i] && csq->type&(1<<i) ) { kputs(csq_strings[i],str); break; }
2728     i++;
2729     for (; i<n; i++)
2730         if ( csq_strings[i] && csq->type&(1<<i) ) { kputc_('&',str); kputs(csq_strings[i],str); }
2731 
2732     kputc_('|', str);
2733     if ( csq->gene ) kputs(csq->gene , str);
2734 
2735     kputc_('|', str);
2736     if ( csq->type & CSQ_PRN_TSCRIPT ) kputs(args->tscript_ids.str[csq->trid], str);
2737 
2738     kputc_('|', str);
2739     kputs(gf_type2gff_string(csq->biotype), str);
2740 
2741     if ( CSQ_PRN_STRAND(csq->type) || csq->vstr.l )
2742         kputs(csq->strand==STRAND_FWD ? "|+" : "|-", str);
2743 
2744     if ( csq->vstr.l )
2745         kputs(csq->vstr.s, str);
2746 }
2747 
kprint_aa_prediction(args_t * args,int beg,kstring_t * aa,kstring_t * str)2748 void kprint_aa_prediction(args_t *args, int beg, kstring_t *aa, kstring_t *str)
2749 {
2750     if ( !args->brief_predictions || (int)aa->l - args->brief_predictions < 3 )
2751         kputs(aa->s, str);
2752     else
2753     {
2754         int i, len = aa->l;
2755         if ( aa->s[len-1]=='*' ) len--;
2756         for (i=0; i<len && i<args->brief_predictions; i++) kputc(aa->s[i], str);
2757         kputs("..", str);
2758         kputw(beg+len, str);
2759     }
2760 }
2761 
hap_add_csq(args_t * args,hap_t * hap,hap_node_t * node,int tlen,int ibeg,int iend,int dlen,int indel)2762 void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, int iend, int dlen, int indel)
2763 {
2764     int i;
2765     tscript_t *tr = hap->tr;
2766     int ref_node = tr->strand==STRAND_FWD ? ibeg : iend;
2767     int icsq = node->ncsq_list++;
2768     hts_expand0(csq_t,node->ncsq_list,node->mcsq_list,node->csq_list);
2769     csq_t *csq = &node->csq_list[icsq];
2770     csq->pos  = hap->stack[ref_node].node->rec->pos;
2771     csq->type.trid    = tr->id;
2772     csq->type.vcf_ial = node->vcf_ial;
2773     csq->type.gene    = tr->gene->name;
2774     csq->type.strand  = tr->strand;
2775     csq->type.biotype = tr->type;
2776 
2777     // only now we see the translated sequence and can determine if the stop/start changes are real
2778     int rm_csq = 0;
2779     csq->type.type = 0;
2780     for (i=ibeg; i<=iend; i++)
2781         csq->type.type |= hap->stack[i].node->csq & CSQ_COMPOUND;
2782     if ( dlen==0 && indel ) csq->type.type |= CSQ_INFRAME_ALTERING;
2783 
2784     int has_upstream_stop = hap->upstream_stop;
2785     if ( hap->stack[ibeg].node->type != HAP_SSS )
2786     {
2787         // check for truncating stops
2788         for (i=0; i<hap->tref.l; i++)
2789             if ( hap->tref.s[i]=='*' ) break;
2790         if ( i!=hap->tref.l )
2791         {
2792             hap->tref.l = i+1;
2793             hap->tref.s[i+1] = 0;
2794         }
2795         for (i=0; i<hap->tseq.l; i++)
2796             if ( hap->tseq.s[i]=='*' ) break;
2797         if ( i!=hap->tseq.l )
2798         {
2799             hap->tseq.l = i+1;
2800             hap->tseq.s[i+1] = 0;
2801             hap->upstream_stop = 1;
2802         }
2803         if ( csq->type.type & CSQ_STOP_LOST )
2804         {
2805             if ( hap->tref.s[hap->tref.l-1]=='*' && hap->tref.s[hap->tref.l-1] == hap->tseq.s[hap->tseq.l-1] )
2806             {
2807                 rm_csq |= CSQ_STOP_LOST;
2808                 csq->type.type |= CSQ_STOP_RETAINED;
2809             }
2810             else if ( hap->tref.s[hap->tref.l-1]!='*' )
2811             {
2812                 // This is CDS 3' incomplete ENSG00000173376/synon.vcf, can also be missense
2813                 // We observe in real data a change to a stop, ENST00000528237/retained-stop-incomplete-cds.vcf
2814                 if ( hap->tseq.s[hap->tseq.l-1] == '*' )
2815                 {
2816                     rm_csq |= CSQ_STOP_GAINED;
2817                     csq->type.type |= CSQ_STOP_RETAINED;
2818                 }
2819                 else
2820                     csq->type.type |= CSQ_INCOMPLETE_CDS;
2821             }
2822         }
2823         if ( csq->type.type & CSQ_START_LOST && hap->tref.s[0]!='M' )
2824         {
2825             rm_csq |= CSQ_START_LOST;
2826             csq->type.type &= ~CSQ_START_LOST;
2827         }
2828         if ( dlen!=0 )
2829         {
2830             if ( dlen%3 )
2831                 csq->type.type |= CSQ_FRAMESHIFT_VARIANT;
2832             else if ( dlen<0 )
2833                 csq->type.type |= CSQ_INFRAME_DELETION;
2834             else
2835                 csq->type.type |= CSQ_INFRAME_INSERTION;
2836             if ( hap->tref.s[hap->tref.l-1]!='*' && hap->tseq.s[hap->tseq.l-1]=='*' )
2837                 csq->type.type |= CSQ_STOP_GAINED;
2838         }
2839         else
2840         {
2841             for (i=0; i<hap->tref.l; i++)
2842                 if ( hap->tref.s[i] != hap->tseq.s[i] ) break;
2843             if ( i==hap->tref.l )
2844                 csq->type.type |= CSQ_SYNONYMOUS_VARIANT;
2845             else if ( hap->tref.s[i] ==  '*' )
2846                 csq->type.type |= CSQ_STOP_LOST;
2847             else if ( hap->tseq.s[i] ==  '*' )
2848                 csq->type.type |= CSQ_STOP_GAINED;
2849             else
2850                 csq->type.type |= CSQ_MISSENSE_VARIANT;
2851         }
2852     }
2853     // Check if compound inframe variants are real inframes, or if the stop codon occurs before the frameshift can be restored
2854     if ( ibeg!=iend && (csq->type.type & (CSQ_INFRAME_DELETION|CSQ_INFRAME_INSERTION|CSQ_INFRAME_ALTERING)) && hap->tseq.s[hap->tseq.l-1]=='*' )
2855     {
2856         rm_csq |= CSQ_INFRAME_DELETION | CSQ_INFRAME_INSERTION | CSQ_INFRAME_ALTERING;
2857         csq->type.type |= CSQ_FRAMESHIFT_VARIANT | CSQ_STOP_GAINED;
2858     }
2859     if ( has_upstream_stop ) csq->type.type |= CSQ_UPSTREAM_STOP;
2860     csq->type.type &= ~rm_csq;
2861 
2862     if ( hap->stack[ibeg].node->type == HAP_SSS  )
2863     {
2864         node->csq_list[icsq].type.type   |= hap->stack[ibeg].node->csq & ~rm_csq;
2865         node->csq_list[icsq].type.ref     = hap->stack[ibeg].node->rec;
2866         node->csq_list[icsq].type.biotype = tr->type;
2867         csq_push(args, node->csq_list+icsq, hap->stack[ibeg].node->rec);
2868         return;
2869     }
2870 
2871     kstring_t str = node->csq_list[icsq].type.vstr;
2872     str.l = 0;
2873 
2874     // create the aa variant string
2875     int aa_rbeg = tr->strand==STRAND_FWD ? node2rbeg(ibeg)/3+1 : (hap->tr->nsref - 2*N_REF_PAD - node2rend(iend))/3+1;
2876     int aa_sbeg = tr->strand==STRAND_FWD ? node2sbeg(ibeg)/3+1 : (tlen - node2send(iend))/3+1;
2877     kputc_('|', &str);
2878     kputw(aa_rbeg, &str);
2879     kprint_aa_prediction(args,aa_rbeg,&hap->tref,&str);
2880     if ( !(csq->type.type & CSQ_SYNONYMOUS_VARIANT) )
2881     {
2882         kputc_('>', &str);
2883         kputw(aa_sbeg, &str);
2884         kprint_aa_prediction(args,aa_sbeg,&hap->tseq,&str);
2885     }
2886     kputc_('|', &str);
2887 
2888     // create the dna variant string and, in case of combined variants,
2889     // insert silent CSQ_PRINTED_UPSTREAM variants
2890     for (i=ibeg; i<=iend; i++)
2891     {
2892         if ( i>ibeg ) kputc_('+', &str);
2893         kputw(node2rpos(i)+1, &str);
2894         kputs(hap->stack[i].node->var, &str);
2895     }
2896     node->csq_list[icsq].type.vstr = str;
2897     csq_push(args, node->csq_list+icsq, hap->stack[ref_node].node->rec);
2898 
2899     for (i=ibeg; i<=iend; i++)
2900     {
2901         // csq are printed at one position only for combined variants, the rest is
2902         // silent and references the first
2903         if ( hap->stack[i].node->csq & ~CSQ_COMPOUND )
2904         {
2905             node->ncsq_list++;
2906             hts_expand0(csq_t,node->ncsq_list,node->mcsq_list,node->csq_list);
2907             csq_t *tmp_csq = &node->csq_list[node->ncsq_list - 1];
2908             tmp_csq->pos  = hap->stack[i].node->rec->pos;
2909             tmp_csq->type.trid    = tr->id;
2910             //??tmp_csq->type.vcf_ial = node->vcf_ial;  .. this should not be needed for non-compound variants
2911             tmp_csq->type.gene    = tr->gene->name;
2912             tmp_csq->type.strand  = tr->strand;
2913             tmp_csq->type.type    = hap->stack[i].node->csq & ~CSQ_COMPOUND & ~rm_csq;
2914             tmp_csq->type.biotype = tr->type;
2915             tmp_csq->type.vstr.l  = 0;
2916             kputs(str.s,&tmp_csq->type.vstr);
2917             csq_push(args, tmp_csq, hap->stack[i].node->rec);
2918         }
2919         if ( i!=ref_node && (node->csq_list[icsq].type.type & CSQ_COMPOUND || !(hap->stack[i].node->csq & ~CSQ_COMPOUND)) )
2920         {
2921             node->ncsq_list++;
2922             hts_expand0(csq_t,node->ncsq_list,node->mcsq_list,node->csq_list);
2923             csq_t *tmp_csq = &node->csq_list[node->ncsq_list - 1];
2924             tmp_csq->pos  = hap->stack[i].node->rec->pos;
2925             tmp_csq->type.trid    = tr->id;
2926             //??tmp_csq->type.vcf_ial = node->vcf_ial;  .. this should not be needed for non-compound variants
2927             tmp_csq->type.gene    = tr->gene->name;
2928             tmp_csq->type.strand  = tr->strand;
2929             tmp_csq->type.type    = CSQ_PRINTED_UPSTREAM | hap->stack[i].node->csq;
2930             tmp_csq->type.biotype = tr->type;
2931             tmp_csq->type.ref     = hap->stack[ref_node].node->rec;
2932             tmp_csq->type.vstr.l  = 0;
2933             csq_push(args, tmp_csq, hap->stack[i].node->rec);
2934         }
2935     }
2936 }
2937 
2938 
hap_finalize(args_t * args,hap_t * hap)2939 void hap_finalize(args_t *args, hap_t *hap)
2940 {
2941     tscript_t *tr = hap->tr;
2942     if ( !tr->sref )
2943         tscript_splice_ref(tr);
2944 
2945     kstring_t sref;
2946     sref.s = tr->sref;
2947     sref.l = tr->nsref;
2948     sref.m = sref.l;
2949 
2950     int istack = 0;
2951     hts_expand(hstack_t,1,hap->mstack,hap->stack);
2952 
2953     hap->sseq.l = 0;
2954     hap->tseq.l = 0;
2955     hap->stack[0].node = tr->root;
2956     hap->stack[0].ichild = -1;
2957     hap->stack[0].slen = 0;
2958     hap->stack[0].dlen = 0;
2959 
2960     while ( istack>=0 )
2961     {
2962         hstack_t *stack  = &hap->stack[istack];
2963         hap_node_t *node = hap->stack[istack].node;
2964         while ( ++hap->stack[istack].ichild < node->nchild )
2965         {
2966             if ( node->child[stack->ichild] ) break;
2967         }
2968         if ( stack->ichild == node->nchild ) { istack--; continue; }
2969 
2970         node = node->child[stack->ichild];
2971 
2972         istack++;
2973         hts_expand(hstack_t,istack+1,hap->mstack,hap->stack);
2974         stack = &hap->stack[istack-1];
2975 
2976         hap->stack[istack].node = node;
2977         hap->stack[istack].ichild = -1;
2978 
2979         hap->sseq.l = stack->slen;
2980         if ( node->type==HAP_CDS ) kputs(node->seq, &hap->sseq);
2981         hap->stack[istack].slen = hap->sseq.l;
2982         hap->stack[istack].dlen = hap->stack[istack-1].dlen + node->dlen;
2983 
2984         if ( !node->nend ) continue;    // not a leaf node
2985 
2986         // The spliced sequence has been built for the current haplotype and stored
2987         // in hap->sseq. Now we break it and output as independent parts
2988 
2989         kstring_t sseq;
2990         sseq.m = sref.m - 2*N_REF_PAD + hap->stack[istack].dlen;  // total length of the spliced query transcript
2991         hap->upstream_stop = 0;
2992 
2993         int i = 1, dlen = 0, ibeg, indel = 0;
2994         hap->sbeg = hap->stack[i].node->sbeg;
2995         assert( hap->stack[istack].node->type != HAP_SSS );
2996         if ( tr->strand==STRAND_FWD )
2997         {
2998             i = 0, ibeg = -1;
2999             while ( ++i <= istack )
3000             {
3001                 assert( hap->stack[i].node->type != HAP_SSS );
3002 
3003                 dlen += hap->stack[i].node->dlen;
3004                 if ( hap->stack[i].node->dlen ) indel = 1;
3005 
3006                 // This condition extends compound variants.
3007                 if ( i<istack )
3008                 {
3009                     if ( dlen%3 )   // frameshift
3010                     {
3011                         if ( ibeg==-1 ) ibeg = i;
3012                         continue;
3013                     }
3014                     // the last base of the current variant vs the first base of the next
3015                     // variant: are they in the same codon? (forward strand)
3016                     int icur  = node2sbeg(i);
3017                     int inext = node2sbeg(i+1);
3018                     if ( hap->stack[i].node->dlen > 0 ) icur += hap->stack[i].node->dlen;
3019                     else if ( hap->stack[i].node->dlen < 0 ) icur++;
3020                     if ( icur/3 == inext/3 )    // in the same codon, can't be flushed yet
3021                     {
3022                         if ( ibeg==-1 ) ibeg = i;
3023                         continue;
3024                     }
3025                 }
3026                 if ( ibeg<0 ) ibeg = i;
3027 
3028                 int ioff = node2soff(ibeg);
3029                 int icur = node2sbeg(ibeg);
3030                 int rbeg = node2rbeg(ibeg);
3031                 int rend = node2rend(i);
3032                 int fill = dlen%3;
3033 
3034                 // alt
3035                 if ( hap->sseq.l )
3036                 {
3037                     sseq.l = hap->stack[i].slen - ioff;
3038                     sseq.s = hap->sseq.s + ioff;
3039                 }
3040                 else    // splice site overlap, see #1475227917
3041                     sseq.l = fill = 0;
3042                 cds_translate(&sref, &sseq, icur,rbeg,rend, tr->strand, &hap->tseq, fill);
3043 
3044                 // ref
3045                 sseq.l = node2rend(i) - rbeg;
3046                 sseq.s = sref.s + N_REF_PAD + rbeg;
3047                 sseq.m = sref.m - 2*N_REF_PAD;
3048                 cds_translate(&sref, &sseq, rbeg,rbeg,rend, tr->strand, &hap->tref, fill);
3049                 sseq.m = sref.m - 2*N_REF_PAD + hap->stack[istack].dlen;
3050 
3051                 hap_add_csq(args,hap,node,0, ibeg,i,dlen,indel);
3052                 ibeg = -1;
3053                 dlen = 0;
3054                 indel = 0;
3055             }
3056         }
3057         else
3058         {
3059             i = istack + 1, ibeg = -1;
3060             while ( --i > 0 )
3061             {
3062                 assert ( hap->stack[i].node->type != HAP_SSS );
3063                 dlen += hap->stack[i].node->dlen;
3064                 if ( hap->stack[i].node->dlen ) indel = 1;
3065                 if ( i>1 )
3066                 {
3067                     if ( dlen%3 )
3068                     {
3069                         if ( ibeg==-1 ) ibeg = i;
3070                         continue;
3071                     }
3072                     // the last base of the current variant vs the first base of the next
3073                     // variant: are they in the same codon? (reverse strand)
3074                     int icur  = sseq.m - 1 - node2sbeg(i);
3075                     int inext = sseq.m - 1 - node2sbeg(i-1);
3076                     if ( hap->stack[i].node->dlen > 0 ) icur += hap->stack[i].node->dlen - 1;
3077                     else if ( hap->stack[i].node->dlen < 0 ) icur -= hap->stack[i].node->dlen;
3078                     if ( hap->stack[i-1].node->dlen > 0 ) inext -= hap->stack[i-1].node->dlen;
3079                     if ( icur/3 == inext/3 )
3080                     {
3081                         if ( ibeg==-1 ) ibeg = i;
3082                         continue;
3083                     }
3084                 }
3085                 if ( ibeg<0 ) ibeg = i;
3086                 int ioff = node2soff(i);
3087                 int icur = node2sbeg(i);
3088                 int rbeg = node2rbeg(i);
3089                 int rend = node2rend(ibeg);
3090                 int fill = dlen%3;
3091 
3092                 // alt
3093                 if ( hap->sseq.l )
3094                 {
3095                     sseq.l = hap->stack[ibeg].slen - ioff;
3096                     sseq.s = hap->sseq.s + ioff;
3097                 }
3098                 else    // splice site overlap, see #1475227917
3099                     sseq.l = fill = 0;
3100                 cds_translate(&sref, &sseq, icur,rbeg,rend, tr->strand, &hap->tseq, fill);
3101 
3102                 // ref
3103                 sseq.l = node2rend(ibeg) - rbeg;
3104                 sseq.s = sref.s + N_REF_PAD + rbeg;
3105                 sseq.m = sref.m - 2*N_REF_PAD;
3106                 cds_translate(&sref, &sseq, rbeg,rbeg,rend, tr->strand, &hap->tref, fill);
3107                 sseq.m = sref.m - 2*N_REF_PAD + hap->stack[istack].dlen;
3108 
3109                 hap_add_csq(args,hap,node,sseq.m, i,ibeg,dlen,indel);
3110                 ibeg = -1;
3111                 dlen = 0;
3112                 indel = 0;
3113             }
3114         }
3115     }
3116 }
3117 
csq_print_text(args_t * args,csq_t * csq,int ismpl,int ihap)3118 static inline void csq_print_text(args_t *args, csq_t *csq, int ismpl, int ihap)
3119 {
3120     if ( csq->type.type & CSQ_PRINTED_UPSTREAM ) return;
3121 
3122     char *smpl = ismpl >= 0 ? args->hdr->samples[ismpl] : "-";
3123     const char *chr = bcf_hdr_id2name(args->hdr,args->rid);
3124 
3125     fprintf(args->out,"CSQ\t%s\t", smpl);
3126     if ( ihap>0 )
3127         fprintf(args->out,"%d", ihap);
3128     else
3129         fprintf(args->out,"-");
3130 
3131     args->str.l = 0;
3132     kput_vcsq(args, &csq->type, &args->str);
3133     fprintf(args->out,"\t%s\t%d\t%s\n",chr,csq->pos+1,args->str.s);
3134 }
hap_print_text(args_t * args,tscript_t * tr,int ismpl,int ihap,hap_node_t * node)3135 static inline void hap_print_text(args_t *args, tscript_t *tr, int ismpl, int ihap, hap_node_t *node)
3136 {
3137     if ( !node || !node->ncsq_list ) return;
3138 
3139     char *smpl = ismpl >= 0 ? args->hdr->samples[ismpl] : "-";
3140     const char *chr = bcf_hdr_id2name(args->hdr,args->rid);
3141 
3142     int i;
3143     for (i=0; i<node->ncsq_list; i++)
3144     {
3145         csq_t *csq = node->csq_list + i;
3146         if ( csq->type.type & CSQ_PRINTED_UPSTREAM ) continue;
3147         assert( csq->type.vstr.l );
3148 
3149         fprintf(args->out,"CSQ\t%s\t", smpl);
3150         if ( ihap>0 )
3151             fprintf(args->out,"%d", ihap);
3152         else
3153             fprintf(args->out,"-");
3154 
3155         args->str.l = 0;
3156         kput_vcsq(args, &csq->type, &args->str);
3157         fprintf(args->out,"\t%s\t%d\t%s\n",chr,csq->pos+1,args->str.s);
3158     }
3159 }
3160 
hap_stage_vcf(args_t * args,tscript_t * tr,int ismpl,int ihap,hap_node_t * node)3161 static inline void hap_stage_vcf(args_t *args, tscript_t *tr, int ismpl, int ihap, hap_node_t *node)
3162 {
3163     if ( !node || !node->ncsq_list || ismpl<0 ) return;
3164 
3165     int i;
3166     for (i=0; i<node->ncsq_list; i++)
3167     {
3168         csq_t *csq = node->csq_list + i;
3169         vrec_t *vrec = csq->vrec;
3170         int icsq2 = 2*csq->idx + ihap;
3171         if ( icsq2 >= args->ncsq2_max ) // more than ncsq2_max consequences, so can't fit it in FMT
3172         {
3173             if ( args->verbosity && (!args->ncsq2_small_warned || args->verbosity > 1) )
3174             {
3175                 fprintf(stderr,
3176                     "Warning: Too many consequences for sample %s at %s:%"PRId64", keeping the first %d and skipping the rest.\n",
3177                     args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),(int64_t) vrec->line->pos+1,csq->idx);
3178                 if ( !args->ncsq2_small_warned )
3179                     fprintf(stderr,"         The limit can be increased by setting the --ncsq parameter. This warning is printed only once.\n");
3180             }
3181             if ( args->ncsq2_small_warned < icsq2 ) args->ncsq2_small_warned = icsq2;
3182             break;
3183         }
3184         int ival, ibit;
3185         icsq2_to_bit(icsq2, &ival,&ibit);
3186         if ( vrec->nfmt < 1 + ival ) vrec->nfmt = 1 + ival;
3187         vrec->fmt_bm[ismpl*args->nfmt_bcsq + ival] |= 1 << ibit;
3188     }
3189 }
3190 
hap_flush(args_t * args,uint32_t pos)3191 void hap_flush(args_t *args, uint32_t pos)
3192 {
3193     int i,j;
3194     tr_heap_t *heap = args->active_tr;
3195     while ( heap->ndat && heap->dat[0]->end<=pos )
3196     {
3197         tscript_t *tr = heap->dat[0];
3198         khp_delete(trhp, heap);
3199         args->hap->tr = tr;
3200         if ( tr->root && tr->root->nchild ) // normal, non-localized calling
3201         {
3202             hap_finalize(args, args->hap);
3203 
3204             if ( args->output_type==FT_TAB_TEXT )   // plain text output, not a vcf
3205             {
3206                 if ( args->phase==PHASE_DROP_GT )
3207                     hap_print_text(args, tr, -1,0, tr->hap[0]);
3208                 else
3209                 {
3210                     for (i=0; i<args->smpl->n; i++)
3211                     {
3212                         for (j=0; j<2; j++)
3213                             hap_print_text(args, tr, args->smpl->idx[i],j+1, tr->hap[i*2+j]);
3214                     }
3215                 }
3216             }
3217             else if ( args->phase!=PHASE_DROP_GT )
3218             {
3219                 for (i=0; i<args->smpl->n; i++)
3220                 {
3221                     for (j=0; j<2; j++)
3222                         hap_stage_vcf(args, tr, args->smpl->idx[i],j, tr->hap[i*2+j]);
3223                 }
3224             }
3225         }
3226 
3227         // mark the transcript for deletion. Cannot delete it immediately because
3228         // by-position VCF output will need them when flushed by vcf_buf_push
3229         args->nrm_tr++;
3230         hts_expand(tscript_t*,args->nrm_tr,args->mrm_tr,args->rm_tr);
3231         args->rm_tr[args->nrm_tr-1] = tr;
3232     }
3233 }
3234 
3235 #define SWAP(type_t, a, b) { type_t t = a; a = b; b = t; }
3236 
vbuf_push(args_t * args,bcf1_t ** rec_ptr)3237 vbuf_t *vbuf_push(args_t *args, bcf1_t **rec_ptr)
3238 {
3239     int i;
3240 
3241     assert(rec_ptr);
3242     bcf1_t *rec = *rec_ptr;
3243 
3244     // check for duplicate records
3245     i = args->vcf_rbuf.n ? rbuf_last(&args->vcf_rbuf) : -1;
3246     if ( i<0 || args->vcf_buf[i]->vrec[0]->line->pos!=rec->pos )
3247     {
3248         // vcf record with a new pos
3249         rbuf_expand0(&args->vcf_rbuf, vbuf_t*, args->vcf_rbuf.n+1, args->vcf_buf);
3250         i = rbuf_append(&args->vcf_rbuf);
3251         if ( !args->vcf_buf[i] ) args->vcf_buf[i] = (vbuf_t*) calloc(1,sizeof(vbuf_t));
3252         args->vcf_buf[i]->n = 0;
3253         args->vcf_buf[i]->keep_until = 0;
3254     }
3255     vbuf_t *vbuf = args->vcf_buf[i];
3256     vbuf->n++;
3257     hts_expand0(vrec_t*, vbuf->n, vbuf->m, vbuf->vrec);
3258     if ( !vbuf->vrec[vbuf->n - 1] )
3259         vbuf->vrec[vbuf->n - 1] = (vrec_t*) calloc(1,sizeof(vrec_t));
3260 
3261     vrec_t *vrec = vbuf->vrec[vbuf->n - 1];
3262     if ( args->phase!=PHASE_DROP_GT && args->smpl->n )
3263     {
3264         if ( !vrec->fmt_bm ) vrec->fmt_bm = (uint32_t*) calloc(args->hdr_nsmpl,sizeof(*vrec->fmt_bm) * args->nfmt_bcsq);
3265         else memset(vrec->fmt_bm,0,args->hdr_nsmpl*sizeof(*vrec->fmt_bm) * args->nfmt_bcsq);
3266     }
3267     if ( !vrec->line ) vrec->line = bcf_init1();
3268     SWAP(bcf1_t*, (*rec_ptr), vrec->line);
3269 
3270     int ret;
3271     khint_t k = kh_put(pos2vbuf, args->pos2vbuf, (int)rec->pos, &ret);
3272     kh_val(args->pos2vbuf,k) = vbuf;
3273 
3274     return vbuf;
3275 }
3276 
vbuf_flush(args_t * args,uint32_t pos)3277 void vbuf_flush(args_t *args, uint32_t pos)
3278 {
3279     int i,j;
3280     while ( args->vcf_rbuf.n )
3281     {
3282         vbuf_t *vbuf;
3283         if ( !args->local_csq && args->active_tr->ndat )
3284         {
3285             // check if the first active transcript starts beyond the first buffered VCF record,
3286             // cannot output buffered VCF lines (args.vbuf) until the active transcripts are gone
3287             vbuf = args->vcf_buf[ args->vcf_rbuf.f ];
3288             if ( vbuf->keep_until > pos ) break;
3289             assert( vbuf->n );
3290         }
3291 
3292         i = rbuf_shift(&args->vcf_rbuf);
3293         assert( i>=0 );
3294         vbuf = args->vcf_buf[i];
3295         int pos = vbuf->n ? vbuf->vrec[0]->line->pos : -1;
3296         for (i=0; i<vbuf->n; i++)
3297         {
3298             vrec_t *vrec = vbuf->vrec[i];
3299             if ( !args->out_fh ) // not a VCF output
3300             {
3301                 vrec->nvcsq = 0;
3302                 continue;
3303             }
3304             if ( !vrec->nvcsq )
3305             {
3306                 if ( bcf_write(args->out_fh, args->hdr, vrec->line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname?args->output_fname:"standard output");
3307                 int save_pos = vrec->line->pos;
3308                 bcf_empty(vrec->line);
3309                 vrec->line->pos = save_pos;  // this is necessary for compound variants
3310                 continue;
3311             }
3312 
3313             args->str.l = 0;
3314             kput_vcsq(args, &vrec->vcsq[0], &args->str);
3315             for (j=1; j<vrec->nvcsq; j++)
3316             {
3317                 kputc_(',', &args->str);
3318                 kput_vcsq(args, &vrec->vcsq[j], &args->str);
3319             }
3320             bcf_update_info_string(args->hdr, vrec->line, args->bcsq_tag, args->str.s);
3321             if ( args->hdr_nsmpl )
3322             {
3323                 if ( vrec->nfmt < args->nfmt_bcsq )
3324                     for (j=1; j<args->hdr_nsmpl; j++)
3325                         memmove(&vrec->fmt_bm[j*vrec->nfmt], &vrec->fmt_bm[j*args->nfmt_bcsq], vrec->nfmt*sizeof(*vrec->fmt_bm));
3326                 bcf_update_format_int32(args->hdr, vrec->line, args->bcsq_tag, vrec->fmt_bm, args->hdr_nsmpl*vrec->nfmt);
3327             }
3328             vrec->nvcsq = 0;
3329             if ( bcf_write(args->out_fh, args->hdr, vrec->line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname?args->output_fname:"standard output");
3330             int save_pos = vrec->line->pos;
3331             bcf_empty(vrec->line);
3332             vrec->line->pos = save_pos;
3333         }
3334         if ( pos!=-1 )
3335         {
3336             khint_t k = kh_get(pos2vbuf, args->pos2vbuf, pos);
3337             if ( k != kh_end(args->pos2vbuf) ) kh_del(pos2vbuf, args->pos2vbuf, k);
3338         }
3339         vbuf->n = 0;
3340     }
3341     if ( args->active_tr->ndat ) return;
3342 
3343     for (i=0; i<args->nrm_tr; i++)
3344     {
3345         tscript_t *tr = args->rm_tr[i];
3346         if ( tr->root ) hap_destroy(tr->root);
3347         tr->root = NULL;
3348         free(tr->hap);
3349         free(tr->ref);
3350         free(tr->sref);
3351     }
3352     args->nrm_tr = 0;
3353     args->ncsq_buf = 0;
3354 }
3355 
tscript_init_ref(args_t * args,tscript_t * tr,const char * chr)3356 void tscript_init_ref(args_t *args, tscript_t *tr, const char *chr)
3357 {
3358     int i, len;
3359     int pad_beg = tr->beg >= N_REF_PAD ? N_REF_PAD : tr->beg;
3360 
3361     tr->ref = faidx_fetch_seq(args->fai, chr, tr->beg - pad_beg, tr->end + N_REF_PAD, &len);
3362     if ( !tr->ref )
3363         error("faidx_fetch_seq failed %s:%d-%d\n", chr,tr->beg+1,tr->end+1);
3364 
3365     int pad_end = len - (tr->end - tr->beg + 1 + pad_beg);
3366     if ( pad_beg + pad_end != 2*N_REF_PAD )
3367     {
3368         char *ref = (char*) malloc(tr->end - tr->beg + 1 + 2*N_REF_PAD + 1);
3369         for (i=0; i < N_REF_PAD - pad_beg; i++) ref[i] = 'N';
3370         memcpy(ref+i, tr->ref, len);
3371         len += i;
3372         for (i=0; i < N_REF_PAD - pad_end; i++) ref[i+len] = 'N';
3373         ref[i+len] = 0;
3374         free(tr->ref);
3375         tr->ref = ref;
3376     }
3377 }
3378 
sanity_check_ref(args_t * args,tscript_t * tr,bcf1_t * rec)3379 static void sanity_check_ref(args_t *args, tscript_t *tr, bcf1_t *rec)
3380 {
3381     int vbeg = 0;
3382     int rbeg = rec->pos - tr->beg + N_REF_PAD;
3383     if ( rbeg < 0 ) { vbeg += abs(rbeg); rbeg = 0; }
3384     char *ref = tr->ref + rbeg;
3385     char *vcf = rec->d.allele[0] + vbeg;
3386     assert( vcf - rec->d.allele[0] < strlen(rec->d.allele[0]) && ref - tr->ref < tr->end - tr->beg + 2*N_REF_PAD );
3387     int i = 0;
3388     while ( ref[i] && vcf[i] )
3389     {
3390         if ( ref[i]!=vcf[i] && toupper(ref[i])!=toupper(vcf[i]) )
3391             error("Error: the fasta reference does not match the VCF REF allele at %s:%"PRId64" .. fasta=%c vcf=%c\n",
3392                     bcf_seqname(args->hdr,rec),(int64_t) rec->pos+vbeg+1,ref[i],vcf[i]);
3393         i++;
3394     }
3395 }
3396 
test_cds_local(args_t * args,bcf1_t * rec)3397 int test_cds_local(args_t *args, bcf1_t *rec)
3398 {
3399     int i,j, ret = 0;
3400     const char *chr = bcf_seqname(args->hdr,rec);
3401     // note that the off-by-one extension of rlen is deliberate to account for insertions
3402     if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
3403 
3404     // structures to fake the normal test_cds machinery
3405     hap_node_t root, node;
3406     root.type  = HAP_ROOT;
3407     kstring_t *tref = &args->hap->tref, *tseq = &args->hap->tseq;
3408 
3409     while ( regitr_overlap(args->itr) )
3410     {
3411         gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*);
3412         tscript_t *tr = cds->tr;
3413         if ( !GF_is_coding(tr->type) ) continue;
3414         ret = 1;
3415 
3416         if ( !tr->ref )
3417         {
3418             tscript_init_ref(args, tr, chr);
3419             tscript_splice_ref(tr);
3420             khp_insert(trhp, args->active_tr, &tr);     // only to clean the reference afterwards
3421         }
3422 
3423         sanity_check_ref(args, tr, rec);
3424 
3425         kstring_t sref;
3426         sref.s = tr->sref;
3427         sref.l = tr->nsref;
3428         sref.m = sref.l;
3429 
3430         for (i=1; i<rec->n_allele; i++)
3431         {
3432             if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; }
3433             if ( hap_init(args, &root, &node, cds, rec, i)!=0 ) continue;
3434 
3435             csq_t csq;
3436             memset(&csq, 0, sizeof(csq_t));
3437             csq.pos          = rec->pos;
3438             csq.type.biotype = tr->type;
3439             csq.type.strand  = tr->strand;
3440             csq.type.trid    = tr->id;
3441             csq.type.vcf_ial = i;
3442             csq.type.gene    = tr->gene->name;
3443 
3444             int csq_type = node.csq;
3445 
3446             // code repetition: it would be nice to reuse the code from hap_add_csq, needs refactoring though
3447             if ( node.type == HAP_SSS )
3448             {
3449                 csq.type.type = csq_type;
3450                 csq_stage(args, &csq, rec);
3451             }
3452             else
3453             {
3454                 kstring_t sseq;
3455                 sseq.m = sref.m - 2*N_REF_PAD + node.dlen;
3456                 sseq.s = node.seq;
3457                 int alen = sseq.l = strlen(sseq.s);
3458                 int fill = node.dlen%3 && alen ? 1 : 0; // see #1475227917
3459                 cds_translate(&sref, &sseq, node.sbeg,node.sbeg,node.sbeg+node.rlen, tr->strand, tseq, fill);
3460 
3461                 sseq.m = sref.m - 2*N_REF_PAD;
3462                 sseq.s = sref.s + N_REF_PAD + node.sbeg;
3463                 sseq.l = node.rlen;
3464                 cds_translate(&sref, &sseq, node.sbeg,node.sbeg,node.sbeg+node.rlen, tr->strand, tref, fill);
3465 
3466                 // check for truncating stops
3467                 for (j=0; j<tref->l; j++)
3468                     if ( tref->s[j]=='*' ) break;
3469                 if ( j!=tref->l )
3470                 {
3471                     tref->l = j+1;
3472                     tref->s[j+1] = 0;
3473                 }
3474                 for (j=0; j<tseq->l; j++)
3475                     if ( tseq->s[j]=='*' ) break;
3476                 if ( j!=tseq->l )
3477                 {
3478                     tseq->l = j+1;
3479                     tseq->s[j+1] = 0;
3480                 }
3481                 if ( csq_type & CSQ_STOP_LOST )
3482                 {
3483                     if ( tref->s[tref->l-1]=='*' && tref->s[tref->l-1] == tseq->s[tseq->l-1] )
3484                     {
3485                         csq_type &= ~CSQ_STOP_LOST;
3486                         csq_type |= CSQ_STOP_RETAINED;
3487                     }
3488                     else if (tref->s[tref->l-1]!='*' )
3489                     {
3490                         // This is CDS 3' incomplete ENSG00000173376/synon.vcf, can also be missense
3491                         // We observe in real data a change to a stop, ENST00000528237/retained-stop-incomplete-cds.vcf
3492                         if ( tseq->s[tseq->l-1] == '*' )
3493                         {
3494                             csq_type &= ~CSQ_STOP_GAINED;
3495                             csq_type |= CSQ_STOP_RETAINED;
3496                         }
3497                         else
3498                             csq_type |= CSQ_INCOMPLETE_CDS;
3499                     }
3500                 }
3501                 if ( csq_type & CSQ_START_LOST && tref->s[0]!='M' )
3502                     csq_type &= ~CSQ_START_LOST;
3503                 if ( node.dlen!=0 )
3504                 {
3505                     if ( node.dlen%3 )
3506                         csq_type |= CSQ_FRAMESHIFT_VARIANT;
3507                     else if ( node.dlen<0 )
3508                         csq_type |= CSQ_INFRAME_DELETION;
3509                     else
3510                         csq_type |= CSQ_INFRAME_INSERTION;
3511                     if ( tref->s[tref->l-1]!='*' && tseq->s[tseq->l-1]=='*' )
3512                         csq_type |= CSQ_STOP_GAINED;
3513                 }
3514                 else
3515                 {
3516                     for (j=0; j<tref->l; j++)
3517                         if ( tref->s[j] != tseq->s[j] ) break;
3518                     if ( j==tref->l )
3519                         csq_type |= CSQ_SYNONYMOUS_VARIANT;
3520                     else if ( tref->s[j] ==  '*' )
3521                         csq_type |= CSQ_STOP_LOST;
3522                     else if ( tseq->s[j] ==  '*' )
3523                         csq_type |= CSQ_STOP_GAINED;
3524                     else
3525                         csq_type |= CSQ_MISSENSE_VARIANT;
3526                 }
3527                 if ( csq_type & CSQ_COMPOUND )
3528                 {
3529                     // create the aa variant string
3530                     kstring_t str = {0,0,0};
3531                     int aa_rbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (tr->nsref - 2*N_REF_PAD - node.sbeg - node.rlen)/3+1;
3532                     int aa_sbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (tr->nsref - 2*N_REF_PAD + node.dlen - node.sbeg - alen)/3+1;
3533                     kputc_('|', &str);
3534                     kputw(aa_rbeg, &str);
3535                     kprint_aa_prediction(args,aa_rbeg,tref,&str);
3536                     if ( !(csq_type & CSQ_SYNONYMOUS_VARIANT) )
3537                     {
3538                         kputc_('>', &str);
3539                         kputw(aa_sbeg, &str);
3540                         kprint_aa_prediction(args,aa_sbeg,tseq,&str);
3541                     }
3542                     kputc_('|', &str);
3543                     kputw(rec->pos+1, &str);
3544                     kputs(node.var, &str);
3545                     csq.type.vstr = str;
3546                     csq.type.type = csq_type & CSQ_COMPOUND;
3547                     csq_stage(args, &csq, rec);
3548 
3549                     // all this only to clean vstr when vrec is flushed
3550                     if ( !tr->root )
3551                         tr->root = (hap_node_t*) calloc(1,sizeof(hap_node_t));
3552                     tr->root->ncsq_list++;
3553                     hts_expand0(csq_t,tr->root->ncsq_list,tr->root->mcsq_list,tr->root->csq_list);
3554                     csq_t *rm_csq = tr->root->csq_list + tr->root->ncsq_list - 1;
3555                     rm_csq->type.vstr = str;
3556                 }
3557                 if ( csq_type & ~CSQ_COMPOUND )
3558                 {
3559                     csq.type.type = csq_type & ~CSQ_COMPOUND;
3560                     csq.type.vstr.l = 0;
3561                     csq_stage(args, &csq, rec);
3562                 }
3563             }
3564             free(node.seq);
3565             free(node.var);
3566         }
3567     }
3568     return ret;
3569 }
3570 
test_cds(args_t * args,bcf1_t * rec,vbuf_t * vbuf)3571 int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf)
3572 {
3573     static int overlaps_warned = 0, multiploid_warned = 0;
3574 
3575     int i, ret = 0, hap_ret;
3576     const char *chr = bcf_seqname(args->hdr,rec);
3577     // note that the off-by-one extension of rlen is deliberate to account for insertions
3578     if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
3579     while ( regitr_overlap(args->itr) )
3580     {
3581         gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*);
3582         tscript_t *tr = cds->tr;
3583         if ( !GF_is_coding(tr->type) ) continue;
3584         if ( vbuf->keep_until < tr->end ) vbuf->keep_until = tr->end;
3585         ret = 1;
3586         if ( !tr->root )
3587         {
3588             // initialize the transcript and its haplotype tree, fetch the reference sequence
3589             tscript_init_ref(args, tr, chr);
3590 
3591             tr->root = (hap_node_t*) calloc(1,sizeof(hap_node_t));
3592             tr->nhap = args->phase==PHASE_DROP_GT ? 1 : 2*args->smpl->n;     // maximum ploidy = diploid
3593             tr->hap  = (hap_node_t**) malloc(tr->nhap*sizeof(hap_node_t*));
3594             for (i=0; i<tr->nhap; i++) tr->hap[i] = NULL;
3595             tr->root->nend = tr->nhap;
3596             tr->root->type = HAP_ROOT;
3597 
3598             khp_insert(trhp, args->active_tr, &tr);
3599         }
3600 
3601         sanity_check_ref(args, tr, rec);
3602 
3603         if ( args->phase==PHASE_DROP_GT )
3604         {
3605             if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; }
3606             hap_node_t *parent = tr->hap[0] ? tr->hap[0] : tr->root;
3607             hap_node_t *child  = (hap_node_t*)calloc(1,sizeof(hap_node_t));
3608             hap_ret = hap_init(args, parent, child, cds, rec, 1);
3609             if ( hap_ret!=0 )
3610             {
3611                 // overlapping or intron variant, cannot apply
3612                 if ( hap_ret==1 )
3613                 {
3614                     if ( args->verbosity && (!overlaps_warned || args->verbosity > 1) )
3615                     {
3616                         fprintf(stderr,
3617                             "Warning: Skipping overlapping variants at %s:%"PRId64"\t%s>%s.\n",
3618                             chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
3619                         if ( !overlaps_warned )
3620                             fprintf(stderr,"         This message is printed only once, the verbosity can be increased with `--verbose 2`\n");
3621                         overlaps_warned = 1;
3622                     }
3623                     if ( args->out )
3624                         fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%"PRId64"\t%s>%s\n", chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
3625                 }
3626                 else ret = 1;   // prevent reporting as intron in test_tscript
3627                 hap_destroy(child);
3628                 continue;
3629             }
3630             if ( child->type==HAP_SSS )
3631             {
3632                 csq_t csq;
3633                 memset(&csq, 0, sizeof(csq_t));
3634                 csq.pos          = rec->pos;
3635                 csq.type.biotype = tr->type;
3636                 csq.type.strand  = tr->strand;
3637                 csq.type.trid    = tr->id;
3638                 csq.type.vcf_ial = 1;
3639                 csq.type.gene    = tr->gene->name;
3640                 csq.type.type = child->csq;
3641                 csq_stage(args, &csq, rec);
3642                 hap_destroy(child);
3643                 ret = 1;
3644                 continue;
3645             }
3646             parent->nend--;
3647             parent->nchild = 1;
3648             parent->mchild = 1;
3649             parent->child  = (hap_node_t**) malloc(sizeof(hap_node_t*));
3650             parent->child[0] = child;
3651             tr->hap[0] = child;
3652             tr->hap[0]->nend = 1;
3653             continue;
3654         }
3655 
3656         // apply the VCF variants and extend the haplotype tree
3657         int j, ismpl, ihap, ngts = bcf_get_genotypes(args->hdr, rec, &args->gt_arr, &args->mgt_arr);
3658         ngts /= bcf_hdr_nsamples(args->hdr);
3659         if ( ngts!=1 && ngts!=2 )
3660         {
3661             if ( args->verbosity && (!multiploid_warned || args->verbosity > 1) )
3662             {
3663                 fprintf(stderr,
3664                     "Warning: Skipping site with non-diploid/non-haploid genotypes at %s:%"PRId64"\t%s>%s.\n",
3665                     chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
3666                 if ( !multiploid_warned )
3667                     fprintf(stderr,"         This message is printed only once, the verbosity can be increased with `--verbose 2`\n");
3668                 multiploid_warned = 1;
3669             }
3670             if ( args->out )
3671                 fprintf(args->out,"LOG\tWarning: Skipping site with non-diploid/non-haploid genotypes at %s:%"PRId64"\t%s>%s\n", chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
3672             continue;
3673         }
3674         for (ismpl=0; ismpl<args->smpl->n; ismpl++)
3675         {
3676             int32_t *gt = args->gt_arr + args->smpl->idx[ismpl]*ngts;
3677             if ( gt[0]==bcf_gt_missing ) continue;
3678 
3679             if ( ngts>1 && gt[1]!=bcf_gt_missing && gt[1]!=bcf_int32_vector_end && bcf_gt_allele(gt[0])!=bcf_gt_allele(gt[1]) )
3680             {
3681                 if ( args->phase==PHASE_MERGE )
3682                 {
3683                     if ( !bcf_gt_allele(gt[0]) ) gt[0] = gt[1];
3684                 }
3685                 if ( !bcf_gt_is_phased(gt[0]) && !bcf_gt_is_phased(gt[1]) )
3686                 {
3687                     if ( args->phase==PHASE_REQUIRE )
3688                         error("Unphased heterozygous genotype at %s:%"PRId64", sample %s. See the --phase option.\n", chr,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]);
3689                     if ( args->phase==PHASE_SKIP )
3690                         continue;
3691                     if ( args->phase==PHASE_NON_REF )
3692                     {
3693                         if ( !bcf_gt_allele(gt[0]) ) gt[0] = gt[1];
3694                         else if ( !bcf_gt_allele(gt[1]) ) gt[1] = gt[0];
3695                     }
3696                 }
3697             }
3698 
3699             for (ihap=0; ihap<ngts; ihap++)
3700             {
3701                 if ( gt[ihap]==bcf_gt_missing || gt[ihap]==bcf_int32_vector_end ) continue;
3702 
3703                 i = 2*ismpl + ihap;
3704 
3705                 int ial = bcf_gt_allele(gt[ihap]);
3706                 if ( !ial ) continue;
3707                 assert( ial < rec->n_allele );
3708                 if ( rec->d.allele[ial][0]=='<' || rec->d.allele[ial][0]=='*' ) { continue; }
3709 
3710                 hap_node_t *parent = tr->hap[i] ? tr->hap[i] : tr->root;
3711                 if ( parent->cur_rec==rec && parent->cur_child[ial]>=0 )
3712                 {
3713                     // this haplotype has been seen in another sample
3714                     tr->hap[i] = parent->child[ parent->cur_child[ial] ];
3715                     tr->hap[i]->nend++;
3716                     parent->nend--;
3717                     continue;
3718                 }
3719 
3720                 hap_node_t *child = (hap_node_t*)calloc(1,sizeof(hap_node_t));
3721                 hap_ret = hap_init(args, parent, child, cds, rec, ial);
3722                 if ( hap_ret!=0 )
3723                 {
3724                     // overlapping or intron variant, cannot apply
3725                     if ( hap_ret==1 )
3726                     {
3727                         if ( args->verbosity && (!overlaps_warned || args->verbosity > 1) )
3728                         {
3729                             fprintf(stderr,
3730                                     "Warning: Skipping overlapping variants at %s:%"PRId64", sample %s\t%s>%s.\n",
3731                                     chr,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]);
3732                             if ( !overlaps_warned )
3733                                 fprintf(stderr,"         This message is printed only once, the verbosity can be increased with `--verbose 2`\n");
3734                             overlaps_warned = 1;
3735                         }
3736                         if ( args->out  )
3737                             fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%"PRId64", sample %s\t%s>%s\n",
3738                                     chr,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]);
3739                     }
3740                     hap_destroy(child);
3741                     continue;
3742                 }
3743                 if ( child->type==HAP_SSS )
3744                 {
3745                     csq_t csq;
3746                     memset(&csq, 0, sizeof(csq_t));
3747                     csq.pos          = rec->pos;
3748                     csq.type.biotype = tr->type;
3749                     csq.type.strand  = tr->strand;
3750                     csq.type.trid    = tr->id;
3751                     csq.type.vcf_ial = ial;
3752                     csq.type.gene    = tr->gene->name;
3753                     csq.type.type = child->csq;
3754                     csq_stage(args, &csq, rec);
3755                     hap_destroy(child);
3756                     continue;
3757                 }
3758                 if ( parent->cur_rec!=rec )
3759                 {
3760                     hts_expand(int,rec->n_allele,parent->mcur_child,parent->cur_child);
3761                     for (j=0; j<rec->n_allele; j++) parent->cur_child[j] = -1;
3762                     parent->cur_rec = rec;
3763                 }
3764 
3765                 j = parent->nchild++;
3766                 hts_expand0(hap_node_t*,parent->nchild,parent->mchild,parent->child);
3767                 parent->cur_child[ial] = j;
3768                 parent->child[j] = child;
3769                 tr->hap[i] = child;
3770                 tr->hap[i]->nend++;
3771                 parent->nend--;
3772             }
3773         }
3774     }
3775     return ret;
3776 }
3777 
csq_stage(args_t * args,csq_t * csq,bcf1_t * rec)3778 void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec)
3779 {
3780     // known issues: tab output leads to unsorted output. This is because
3781     // coding haplotypes are printed in one go and buffering is not used
3782     // with tab output. VCF output is OK though.
3783     if ( csq_push(args, csq, rec)!=0 && args->phase==PHASE_DROP_GT ) return;    // the consequence already exists
3784 
3785     int i,j,ngt = 0;
3786     if ( args->phase!=PHASE_DROP_GT )
3787     {
3788         ngt = bcf_get_genotypes(args->hdr, rec, &args->gt_arr, &args->mgt_arr);
3789         if ( ngt>0 ) ngt /= bcf_hdr_nsamples(args->hdr);
3790     }
3791     if ( ngt<=0 )
3792     {
3793         if ( args->output_type==FT_TAB_TEXT )
3794             csq_print_text(args, csq, -1,0);
3795         return;
3796     }
3797     assert( ngt<=2 );
3798 
3799     if ( args->output_type==FT_TAB_TEXT )
3800     {
3801         for (i=0; i<args->smpl->n; i++)
3802         {
3803             int32_t *gt = args->gt_arr + args->smpl->idx[i]*ngt;
3804             for (j=0; j<ngt; j++)
3805             {
3806                 if ( gt[j]==bcf_gt_missing || gt[j]==bcf_int32_vector_end ) continue;
3807                 int ial = bcf_gt_allele(gt[j]);
3808                 if ( !ial || ial!=csq->type.vcf_ial ) continue;
3809                 csq_print_text(args, csq, args->smpl->idx[i],j+1);
3810             }
3811         }
3812         return;
3813     }
3814 
3815     vrec_t *vrec = csq->vrec;
3816     for (i=0; i<args->smpl->n; i++)
3817     {
3818         int32_t *gt = args->gt_arr + args->smpl->idx[i]*ngt;
3819         for (j=0; j<ngt; j++)
3820         {
3821             if ( gt[j]==bcf_gt_missing || gt[j]==bcf_int32_vector_end ) continue;
3822             int ial = bcf_gt_allele(gt[j]);
3823             if ( !ial || ial!=csq->type.vcf_ial ) continue;
3824 
3825             int icsq2 = 2*csq->idx + j;
3826             if ( icsq2 >= args->ncsq2_max ) // more than ncsq_max consequences, so can't fit it in FMT
3827             {
3828                 int ismpl = args->smpl->idx[i];
3829                 if ( args->verbosity && (!args->ncsq2_small_warned || args->verbosity > 1) )
3830                 {
3831                     fprintf(stderr,
3832                             "Warning: Too many consequences for sample %s at %s:%"PRId64", keeping the first %d and skipping the rest.\n",
3833                             args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),(int64_t) vrec->line->pos+1,icsq2+1);
3834                     if ( !args->ncsq2_small_warned )
3835                         fprintf(stderr,"         The limit can be increased by setting the --ncsq parameter. This warning is printed only once.\n");
3836                     args->ncsq2_small_warned = 1;
3837                 }
3838                 if ( args->ncsq2_small_warned < icsq2 ) args->ncsq2_small_warned = icsq2;
3839                 break;
3840             }
3841             int ival, ibit;
3842             icsq2_to_bit(icsq2, &ival,&ibit);
3843             if ( vrec->nfmt < 1 + ival ) vrec->nfmt = 1 + ival;
3844             vrec->fmt_bm[i*args->nfmt_bcsq + ival] |= 1 << ibit;
3845         }
3846     }
3847 }
test_utr(args_t * args,bcf1_t * rec)3848 int test_utr(args_t *args, bcf1_t *rec)
3849 {
3850     const char *chr = bcf_seqname(args->hdr,rec);
3851     // note that the off-by-one extension of rlen is deliberate to account for insertions
3852     if ( !regidx_overlap(args->idx_utr,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
3853 
3854     splice_t splice;
3855     splice_init(&splice, rec);
3856 
3857     int i, ret = 0;
3858     while ( regitr_overlap(args->itr) )
3859     {
3860         gf_utr_t *utr = regitr_payload(args->itr, gf_utr_t*);
3861         tscript_t *tr = splice.tr = utr->tr;
3862         for (i=1; i<rec->n_allele; i++)
3863         {
3864             if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; }
3865             splice.vcf.alt = rec->d.allele[i];
3866             splice.csq     = 0;
3867             int splice_ret = splice_csq(args, &splice, utr->beg, utr->end);
3868             if ( splice_ret!=SPLICE_INSIDE && splice_ret!=SPLICE_OVERLAP ) continue;
3869             csq_t csq;
3870             memset(&csq, 0, sizeof(csq_t));
3871             csq.pos          = rec->pos;
3872             csq.type.type    = utr->which==prime5 ? CSQ_UTR5 : CSQ_UTR3;
3873             csq.type.biotype = tr->type;
3874             csq.type.strand  = tr->strand;
3875             csq.type.trid    = tr->id;
3876             csq.type.vcf_ial = i;
3877             csq.type.gene    = tr->gene->name;
3878             csq_stage(args, &csq, rec);
3879             ret = 1;
3880         }
3881     }
3882     assert(!splice.kref.s);
3883     assert(!splice.kalt.s);
3884     return ret;
3885 }
test_splice(args_t * args,bcf1_t * rec)3886 int test_splice(args_t *args, bcf1_t *rec)
3887 {
3888     const char *chr = bcf_seqname(args->hdr,rec);
3889     if ( !regidx_overlap(args->idx_exon,chr,rec->pos,rec->pos + rec->rlen, args->itr) ) return 0;
3890 
3891     splice_t splice;
3892     splice_init(&splice, rec);
3893     splice.check_acceptor = splice.check_donor = 1;
3894 
3895     int i, ret = 0;
3896     while ( regitr_overlap(args->itr) )
3897     {
3898         gf_exon_t *exon = regitr_payload(args->itr, gf_exon_t*);
3899         splice.tr = exon->tr;
3900         if ( !splice.tr->ncds ) continue;  // not a coding transcript, no interest in splice sites
3901 
3902         splice.check_region_beg = splice.tr->beg==exon->beg ? 0 : 1;
3903         splice.check_region_end = splice.tr->end==exon->end ? 0 : 1;
3904 
3905         for (i=1; i<rec->n_allele; i++)
3906         {
3907             if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; }
3908             splice.vcf.alt = rec->d.allele[i];
3909             splice.csq     = 0;
3910             splice_csq(args, &splice, exon->beg, exon->end);
3911             if ( splice.csq ) ret = 1;
3912         }
3913     }
3914     free(splice.kref.s);
3915     free(splice.kalt.s);
3916     return ret;
3917 }
test_tscript(args_t * args,bcf1_t * rec)3918 int test_tscript(args_t *args, bcf1_t *rec)
3919 {
3920     const char *chr = bcf_seqname(args->hdr,rec);
3921     if ( !regidx_overlap(args->idx_tscript,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
3922 
3923     splice_t splice;
3924     splice_init(&splice, rec);
3925 
3926     int i, ret = 0;
3927     while ( regitr_overlap(args->itr) )
3928     {
3929         tscript_t *tr = splice.tr = regitr_payload(args->itr, tscript_t*);
3930         for (i=1; i<rec->n_allele; i++)
3931         {
3932             if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; }
3933             splice.vcf.alt = rec->d.allele[i];
3934             splice.csq     = 0;
3935             int splice_ret = splice_csq(args, &splice, tr->beg, tr->end);
3936             if ( splice_ret!=SPLICE_INSIDE && splice_ret!=SPLICE_OVERLAP ) continue;    // SPLICE_OUTSIDE or SPLICE_REF
3937             csq_t csq;
3938             memset(&csq, 0, sizeof(csq_t));
3939             csq.pos          = rec->pos;
3940             csq.type.type    = GF_is_coding(tr->type) ? CSQ_INTRON : CSQ_NON_CODING;
3941             csq.type.biotype = tr->type;
3942             csq.type.strand  = tr->strand;
3943             csq.type.trid    = tr->id;
3944             csq.type.gene    = tr->gene->name;
3945             csq_stage(args, &csq, rec);
3946             ret = 1;
3947         }
3948     }
3949     assert(!splice.kref.s);
3950     assert(!splice.kalt.s);
3951     return ret;
3952 }
3953 
test_symbolic_alt(args_t * args,bcf1_t * rec)3954 void test_symbolic_alt(args_t *args, bcf1_t *rec)
3955 {
3956     static int warned = 0;
3957     if ( args->verbosity && (!warned && args->verbosity > 0) )
3958     {
3959         fprintf(stderr,"Warning: The support for symbolic ALT insertions is experimental.\n");
3960         warned = 1;
3961     }
3962 
3963     const char *chr = bcf_seqname(args->hdr,rec);
3964 
3965     // only insertions atm
3966     int beg = rec->pos + 1;
3967     int end = beg;
3968     int csq_class = CSQ_ELONGATION;
3969 
3970     int hit = 0;
3971     if ( regidx_overlap(args->idx_cds,chr,beg,end, args->itr) )
3972     {
3973         while ( regitr_overlap(args->itr) )
3974         {
3975             csq_t csq;
3976             memset(&csq, 0, sizeof(csq_t));
3977             gf_cds_t *cds    = regitr_payload(args->itr,gf_cds_t*);
3978             tscript_t *tr    = cds->tr;
3979             csq.type.type    = (GF_is_coding(tr->type) ? CSQ_CODING_SEQUENCE : CSQ_NON_CODING) | csq_class;
3980             csq.pos          = rec->pos;
3981             csq.type.biotype = tr->type;
3982             csq.type.strand  = tr->strand;
3983             csq.type.trid    = tr->id;
3984             csq.type.gene    = tr->gene->name;
3985             csq_stage(args, &csq, rec);
3986             hit = 1;
3987         }
3988     }
3989     if ( regidx_overlap(args->idx_utr,chr,beg,end, args->itr) )
3990     {
3991         while ( regitr_overlap(args->itr) )
3992         {
3993             csq_t csq;
3994             memset(&csq, 0, sizeof(csq_t));
3995             gf_utr_t *utr    = regitr_payload(args->itr, gf_utr_t*);
3996             tscript_t *tr    = utr->tr;
3997             csq.type.type    = (utr->which==prime5 ? CSQ_UTR5 : CSQ_UTR3) | csq_class;
3998             csq.pos          = rec->pos;
3999             csq.type.biotype = tr->type;
4000             csq.type.strand  = tr->strand;
4001             csq.type.trid    = tr->id;
4002             csq.type.gene    = tr->gene->name;
4003             csq_stage(args, &csq, rec);
4004             hit = 1;
4005         }
4006     }
4007     if ( regidx_overlap(args->idx_exon,chr,beg,end, args->itr) )
4008     {
4009         splice_t splice;
4010         splice_init(&splice, rec);
4011         splice.check_acceptor = splice.check_donor = 1;
4012 
4013         while ( regitr_overlap(args->itr) )
4014         {
4015             gf_exon_t *exon = regitr_payload(args->itr, gf_exon_t*);
4016             splice.tr = exon->tr;
4017             if ( !splice.tr->ncds ) continue;  // not a coding transcript, no interest in splice sites
4018             splice.check_region_beg = splice.tr->beg==exon->beg ? 0 : 1;
4019             splice.check_region_end = splice.tr->end==exon->end ? 0 : 1;
4020             splice.vcf.alt = rec->d.allele[1];
4021             splice.csq     = csq_class;
4022             splice_csq(args, &splice, exon->beg, exon->end);
4023             if ( splice.csq ) hit = 1;
4024         }
4025     }
4026     if ( !hit && regidx_overlap(args->idx_tscript,chr,beg,end, args->itr) )
4027     {
4028         splice_t splice;
4029         splice_init(&splice, rec);
4030 
4031         while ( regitr_overlap(args->itr) )
4032         {
4033             csq_t csq;
4034             memset(&csq, 0, sizeof(csq_t));
4035             tscript_t *tr = splice.tr = regitr_payload(args->itr, tscript_t*);
4036             splice.vcf.alt = rec->d.allele[1];
4037             splice.csq     = csq_class;
4038             int splice_ret = splice_csq(args, &splice, tr->beg, tr->end);
4039             if ( splice_ret!=SPLICE_INSIDE && splice_ret!=SPLICE_OVERLAP ) continue;    // SPLICE_OUTSIDE or SPLICE_REF
4040             csq.type.type    = (GF_is_coding(tr->type) ? CSQ_INTRON : CSQ_NON_CODING) | csq_class;
4041             csq.pos          = rec->pos;
4042             csq.type.biotype = tr->type;
4043             csq.type.strand  = tr->strand;
4044             csq.type.trid    = tr->id;
4045             csq.type.gene    = tr->gene->name;
4046             csq_stage(args, &csq, rec);
4047         }
4048     }
4049 }
4050 
debug_print_buffers(args_t * args,int pos)4051 void debug_print_buffers(args_t *args, int pos)
4052 {
4053     int i,j;
4054     fprintf(stderr,"debug_print_buffers at %d\n", pos);
4055     fprintf(stderr,"vbufs:\n");
4056     for (i=0; i<args->vcf_rbuf.n; i++)
4057     {
4058         int k = rbuf_kth(&args->vcf_rbuf, i);
4059         vbuf_t *vbuf = args->vcf_buf[k];
4060 
4061         fprintf(stderr,"\tvbuf %d:\n", i);
4062         for (j=0; j<vbuf->n; j++)
4063         {
4064             vrec_t *vrec = vbuf->vrec[j];
4065             fprintf(stderr,"\t\t%"PRId64" .. nvcsq=%d\n", (int64_t) vrec->line->pos+1, vrec->nvcsq);
4066         }
4067     }
4068     fprintf(stderr,"pos2vbuf:");
4069     khint_t k;
4070     for (k = 0; k < kh_end(args->pos2vbuf); ++k)
4071         if (kh_exist(args->pos2vbuf, k)) fprintf(stderr," %d",1+(int)kh_key(args->pos2vbuf, k));
4072     fprintf(stderr,"\n");
4073     fprintf(stderr,"active_tr: %d\n", args->active_tr->ndat);
4074 }
4075 
process(args_t * args,bcf1_t ** rec_ptr)4076 static void process(args_t *args, bcf1_t **rec_ptr)
4077 {
4078     if ( !rec_ptr )
4079     {
4080         hap_flush(args, REGIDX_MAX);
4081         vbuf_flush(args, REGIDX_MAX);
4082         return;
4083     }
4084 
4085     bcf1_t *rec = *rec_ptr;
4086     static int32_t prev_rid = -1, prev_pos = -1;
4087     if ( prev_rid!=rec->rid )
4088     {
4089         prev_rid = rec->rid;
4090         prev_pos = rec->pos;
4091 
4092         // Common error is to use different naming conventions in the fasta and the VCF (e.g. X vs chrX).
4093         // Perform a simple sanity check (that does not catch much), the chromosome must be present in the
4094         // reference file
4095         if ( !faidx_has_seq(args->fai,bcf_seqname(args->hdr,rec)) )
4096             error("Error: the chromosome \"%s\" is not present in %s\n",bcf_seqname(args->hdr,rec),args->fa_fname);
4097     }
4098     if ( prev_pos > rec->pos )
4099         error("Error: The file is not sorted, %s:%d comes before %s:%"PRId64"\n",bcf_seqname(args->hdr,rec),prev_pos+1,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
4100 
4101     int call_csq = 1;
4102     if ( rec->n_allele < 2 ) call_csq = 0;   // no alternate allele
4103     else if ( rec->n_allele==2 && (rec->d.allele[1][0]=='*' || rec->d.allele[1][1]=='*') ) call_csq = 0;     // gVCF, not an alt allele
4104     else if ( rec->d.allele[1][0]=='<' )
4105     {
4106         if ( strncmp("<INS",rec->d.allele[1], 4) ) call_csq = 0;    // only <INS[:.*]> is supported at the moment
4107     }
4108     if ( call_csq && args->filter )
4109     {
4110         call_csq = filter_test(args->filter, rec, NULL);
4111         if ( args->filter_logic==FLT_EXCLUDE ) call_csq = call_csq ? 0 : 1;
4112     }
4113     if ( !call_csq )
4114     {
4115         if ( !args->out_fh ) return;    // not a VCF output
4116         vbuf_push(args, rec_ptr);
4117         hap_flush(args, rec->pos-1);
4118         vbuf_flush(args, rec->pos-1);
4119         return;
4120     }
4121 
4122     if ( args->rid != rec->rid )
4123     {
4124         hap_flush(args, REGIDX_MAX);
4125         vbuf_flush(args, REGIDX_MAX);
4126     }
4127     args->rid = rec->rid;
4128     vbuf_t *vbuf = vbuf_push(args, rec_ptr);
4129 
4130     if ( rec->d.allele[1][0]!='<' )
4131     {
4132         int hit = args->local_csq ? test_cds_local(args, rec) : test_cds(args, rec, vbuf);
4133         hit += test_utr(args, rec);
4134         hit += test_splice(args, rec);
4135         if ( !hit ) test_tscript(args, rec);
4136     }
4137     else
4138         test_symbolic_alt(args, rec);
4139 
4140     if ( rec->pos > 0 )
4141     {
4142         hap_flush(args, rec->pos-1);
4143         vbuf_flush(args, rec->pos-1);
4144     }
4145 
4146     return;
4147 }
4148 
usage(void)4149 static const char *usage(void)
4150 {
4151     return
4152         "\n"
4153         "About: Haplotype-aware consequence caller.\n"
4154         "Usage: bcftools csq [OPTIONS] in.vcf\n"
4155         "\n"
4156         "Required options:\n"
4157         "   -f, --fasta-ref FILE              Reference file in fasta format\n"
4158         "   -g, --gff-annot FILE              GFF3 annotation file\n"
4159         "\n"
4160         "CSQ options:\n"
4161         "   -B, --trim-protein-seq INT        Abbreviate protein-changing predictions to max INT aminoacids\n"
4162         "   -c, --custom-tag STRING           Use this tag instead of the default BCSQ\n"
4163         "   -l, --local-csq                   Localized predictions, consider only one VCF record at a time\n"
4164         "   -n, --ncsq INT                    Maximum number of per-haplotype consequences to consider for each site [15]\n"
4165         "   -p, --phase a|m|r|R|s             How to handle unphased heterozygous genotypes: [r]\n"
4166         "                                       a: take GTs as is, create haplotypes regardless of phase (0/1 -> 0|1)\n"
4167         "                                       m: merge *all* GTs into a single haplotype (0/1 -> 1, 1/2 -> 1)\n"
4168         "                                       r: require phased GTs, throw an error on unphased het GTs\n"
4169         "                                       R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n"
4170         "                                       s: skip unphased hets\n"
4171         "Options:\n"
4172         "   -e, --exclude EXPR                Exclude sites for which the expression is true\n"
4173         "       --force                       Run even if some sanity checks fail\n"
4174         "   -i, --include EXPR                Select sites for which the expression is true\n"
4175         "       --no-version                  Do not append version and command line to the header\n"
4176         "   -o, --output FILE                 Write output to a file [standard output]\n"
4177         "   -O, --output-type b|u|z|v|t[0-9]  b: compressed BCF, u: uncompressed BCF, z: compressed VCF\n"
4178         "                                     v: uncompressed VCF, t: plain tab-delimited text output, 0-9: compression level [v]\n"
4179         "   -r, --regions REGION              Restrict to comma-separated list of regions\n"
4180         "   -R, --regions-file FILE           Restrict to regions listed in a file\n"
4181         "       --regions-overlap 0|1|2       Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"
4182         "   -s, --samples -|LIST              Samples to include or \"-\" to apply all variants and ignore samples\n"
4183         "   -S, --samples-file FILE           Samples to include\n"
4184         "   -t, --targets REGION              Similar to -r but streams rather than index-jumps\n"
4185         "   -T, --targets-file FILE           Similar to -R but streams rather than index-jumps\n"
4186         "       --targets-overlap 0|1|2       Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"
4187         "       --threads INT                 Use multithreading with <int> worker threads [0]\n"
4188         "   -v, --verbose INT                 Verbosity level 0-2 [1]\n"
4189         "\n"
4190         "Example:\n"
4191         "   bcftools csq -f hs37d5.fa -g Homo_sapiens.GRCh37.82.gff3.gz in.vcf\n"
4192         "\n"
4193         "   # GFF3 annotation files can be downloaded from Ensembl. e.g. for human:\n"
4194         "   ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens/\n"
4195         "   ftp://ftp.ensembl.org/pub/grch37/release-84/gff3/homo_sapiens/\n"
4196         "\n";
4197 }
4198 
main_csq(int argc,char * argv[])4199 int main_csq(int argc, char *argv[])
4200 {
4201     args_t *args = (args_t*) calloc(1,sizeof(args_t));
4202     args->argc = argc; args->argv = argv;
4203     args->output_type = FT_VCF;
4204     args->bcsq_tag = "BCSQ";
4205     args->ncsq2_max = 2*(16-1);      // 1 bit is reserved for BCF missing values
4206     args->verbosity = 1;
4207     args->record_cmd_line = 1;
4208     args->clevel = -1;
4209 
4210     static struct option loptions[] =
4211     {
4212         {"force",0,0,1},
4213         {"threads",required_argument,NULL,2},
4214         {"help",0,0,'h'},
4215         {"ncsq",1,0,'n'},
4216         {"brief-predictions",no_argument,0,'b'},
4217         {"trim-protein-seq",required_argument,0,'B'},
4218         {"custom-tag",1,0,'c'},
4219         {"local-csq",0,0,'l'},
4220         {"gff-annot",1,0,'g'},
4221         {"fasta-ref",1,0,'f'},
4222         {"include",1,0,'i'},
4223         {"exclude",1,0,'e'},
4224         {"output",1,0,'o'},
4225         {"output-type",1,NULL,'O'},
4226         {"phase",1,0,'p'},
4227         {"quiet",0,0,'q'},
4228         {"verbose",1,0,'v'},
4229         {"regions",1,0,'r'},
4230         {"regions-file",1,0,'R'},
4231         {"regions-overlap",required_argument,NULL,4},
4232         {"samples",1,0,'s'},
4233         {"samples-file",1,0,'S'},
4234         {"targets",1,0,'t'},
4235         {"targets-file",1,0,'T'},
4236         {"targets-overlap",required_argument,NULL,5},
4237         {"no-version",no_argument,NULL,3},
4238         {0,0,0,0}
4239     };
4240     int c, targets_is_file = 0, regions_is_file = 0;
4241     int regions_overlap = 1;
4242     int targets_overlap = 0;
4243     char *targets_list = NULL, *regions_list = NULL, *tmp;
4244     while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:bB:v:",loptions,NULL)) >= 0)
4245     {
4246         switch (c)
4247         {
4248             case  1 : args->force = 1; break;
4249             case  2 :
4250                 args->n_threads = strtol(optarg,&tmp,10);
4251                 if ( *tmp ) error("Could not parse argument: --threads  %s\n", optarg);
4252                 break;
4253             case  3 : args->record_cmd_line = 0; break;
4254             case 'b':
4255                     args->brief_predictions = 1;
4256                     fprintf(stderr,"Warning: the -b option will be removed in future versions. Please use -B 1 instead.\n");
4257                     break;
4258             case 'B':
4259                     args->brief_predictions = strtol(optarg,&tmp,10);
4260                     if ( *tmp || args->brief_predictions<1 ) error("Could not parse argument: --trim-protein-seq %s\n", optarg);
4261                     break;
4262             case 'l': args->local_csq = 1; break;
4263             case 'c': args->bcsq_tag = optarg; break;
4264             case 'q': error("Error: the -q option has been deprecated, use -v, --verbose instead.\n"); break;
4265             case 'v':
4266                 args->verbosity = atoi(optarg);
4267                 if ( args->verbosity<0 || args->verbosity>2 ) error("Error: expected integer 0-2 with -v, --verbose\n");
4268                 break;
4269             case 'p':
4270                 switch (optarg[0])
4271                 {
4272                     case 'a': args->phase = PHASE_AS_IS; break;
4273                     case 'm': args->phase = PHASE_MERGE; break;
4274                     case 'r': args->phase = PHASE_REQUIRE; break;
4275                     case 'R': args->phase = PHASE_NON_REF; break;
4276                     case 's': args->phase = PHASE_SKIP; break;
4277                     default: error("The -p code \"%s\" not recognised\n", optarg);
4278                 }
4279                 break;
4280             case 'f': args->fa_fname = optarg; break;
4281             case 'g': args->gff_fname = optarg; break;
4282             case 'n':
4283                 args->ncsq2_max = 2 * atoi(optarg);
4284                 if ( args->ncsq2_max <= 0 ) error("Expected positive integer with -n, got %s\n", optarg);
4285                 break;
4286             case 'o': args->output_fname = optarg; break;
4287             case 'O':
4288                       switch (optarg[0]) {
4289                           case 't': args->output_type = FT_TAB_TEXT; break;
4290                           case 'b': args->output_type = FT_BCF_GZ; break;
4291                           case 'u': args->output_type = FT_BCF; break;
4292                           case 'z': args->output_type = FT_VCF_GZ; break;
4293                           case 'v': args->output_type = FT_VCF; break;
4294                           default:
4295                           {
4296                               args->clevel = strtol(optarg,&tmp,10);
4297                               if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
4298                           }
4299                       }
4300                       if ( optarg[1] )
4301                       {
4302                           args->clevel = strtol(optarg+1,&tmp,10);
4303                           if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --output-type %s\n", optarg+1);
4304                       }
4305                       break;
4306             case 'e':
4307                 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
4308                 args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
4309             case 'i':
4310                 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
4311                 args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
4312             case 'r': regions_list = optarg; break;
4313             case 'R': regions_list = optarg; regions_is_file = 1; break;
4314             case 's': args->sample_list = optarg; break;
4315             case 'S': args->sample_list = optarg; args->sample_is_file = 1; break;
4316             case 't': targets_list = optarg; break;
4317             case 'T': targets_list = optarg; targets_is_file = 1; break;
4318             case  4 :
4319                 if ( !strcasecmp(optarg,"0") ) regions_overlap = 0;
4320                 else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1;
4321                 else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2;
4322                 else error("Could not parse: --regions-overlap %s\n",optarg);
4323                 break;
4324             case  5 :
4325                 if ( !strcasecmp(optarg,"0") ) targets_overlap = 0;
4326                 else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1;
4327                 else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2;
4328                 else error("Could not parse: --targets-overlap %s\n",optarg);
4329                 break;
4330             case 'h':
4331             case '?': error("%s",usage());
4332             default: error("The option not recognised: %s\n\n", optarg); break;
4333         }
4334     }
4335     char *fname = NULL;
4336     if ( optind==argc )
4337     {
4338         if ( !isatty(fileno((FILE *)stdin)) ) fname = "-";  // reading from stdin
4339         else error("%s", usage());
4340     }
4341     else fname = argv[optind];
4342     if ( argc - optind>1 ) error("%s", usage());
4343     if ( !args->fa_fname ) error("Missing the --fa-ref option\n");
4344     if ( !args->gff_fname ) error("Missing the --gff option\n");
4345     args->sr = bcf_sr_init();
4346     if ( targets_list )
4347     {
4348         bcf_sr_set_opt(args->sr,BCF_SR_TARGETS_OVERLAP,targets_overlap);
4349         if ( bcf_sr_set_targets(args->sr, targets_list, targets_is_file, 0)<0 )
4350             error("Failed to read the targets: %s\n", targets_list);
4351     }
4352     if ( regions_list )
4353     {
4354         bcf_sr_set_opt(args->sr,BCF_SR_REGIONS_OVERLAP,regions_overlap);
4355         if ( bcf_sr_set_regions(args->sr, regions_list, regions_is_file)<0 )
4356             error("Failed to read the regions: %s\n", regions_list);
4357     }
4358     if ( bcf_sr_set_threads(args->sr, args->n_threads)<0 ) error("Failed to create %d extra threads\n", args->n_threads);
4359     if ( !bcf_sr_add_reader(args->sr, fname) )
4360         error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->sr->errnum));
4361     args->hdr = bcf_sr_get_header(args->sr,0);
4362 
4363     init_data(args);
4364     while ( bcf_sr_next_line(args->sr) )
4365     {
4366         process(args, &args->sr->readers[0].buffer[0]);
4367     }
4368     process(args,NULL);
4369 
4370     destroy_data(args);
4371     bcf_sr_destroy(args->sr);
4372     free(args);
4373     return 0;
4374 }
4375 
4376