1 /* The MIT License
2 
3    Copyright (c) 2016 Hyun Min Kang <hmkang.umich.edu> and Adrian Tan <atks@umich.edu>
4 
5    Permission is hereby granted, free of charge, to any person obtaining a copy
6    of this software and associated documentation files (the "Software"), to deal
7    in the Software without restriction, including without limitation the rights
8    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9    copies of the Software, and to permit persons to whom the Software is
10    furnished to do so, subject to the following conditions:
11 
12    The above copyright notice and this permission notice shall be included in
13    all copies or substantial portions of the Software.
14 
15    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21    THE SOFTWARE.
22 */
23 
24 #ifndef GENOTYPING_RECORD_H
25 #define GENOTYPING_RECORD_H
26 
27 #include "htslib/vcf.h"
28 #include "htslib/faidx.h"
29 #include "bcf_ordered_writer.h"
30 #include "variant.h"
31 #include "hts_utils.h"
32 #include "augmented_bam_record.h"
33 #include "estimator.h"
34 
35 #define FILTER_MASK_OVERLAP_SNP   0x0001
36 #define FILTER_MASK_OVERLAP_INDEL 0x0002
37 #define FILTER_MASK_OVERLAP_VNTR  0x0004
38 
39 /**
40  * A generic record that holds information for genotyping a
41  * variant across multiple samples.
42  *
43  * Maintains read information and allows for additional reads
44  * till VCF record can be printed out.
45  */
46 class GenotypingRecord
47 {
48     public:
49     bcf_hdr_t *h;
50     bcf1_t *v;
51     int32_t rid;
52     int32_t pos1; //position of variant
53     //[beg1,end1] is the required overlapping of the variant against the aligned read necessary to make a genotype call.
54     //for SNPs, beg1=end1=pos1
55     //
56     //for Indels, this refers to the flanking positions
57     //   insertion
58     //        if T/TG - beg1=pos1, end1=pos1+1
59     //        if T/GT - beg1=pos1-1, end1=pos1
60     //   deletion
61     //        if TG/T - beg1=pos1, end1=pos1+length(REF)
62     //        if TG/G - beg1=pos1-1, end1=pos1+length(REF)-1
63     int32_t beg1;
64     int32_t end1;
65     int32_t vtype;
66 
67     //indel specific record
68     int32_t dlen;
69     int32_t len;
70     std::string indel;
71 
72     //vntr specific record
73     std::string motif;
74 
75     std::vector<std::string> indel_alleles;
76     std::vector<float> counts;
77 
78     //for records that observe at least one alternate observation
79     std::vector<uint32_t> bqs;  //for SNPs, store BQ, for Indels, store AQ
80     std::vector<uint32_t> aqs;  // store AQ
81     std::vector<uint32_t> mqs;   //map qualities
82     std::string sts;      //strands
83     std::vector<int32_t> als; //alleles
84     std::string dls;  //descriptive alleles
85     std::vector<uint32_t> cys; //cycles
86     std::vector<uint32_t> nms;  //number of mismatches
87 
88     //for records that only have reference observation
89     uint32_t no_nonref;
90     std::vector<uint32_t> allele_depth_fwd;
91     std::vector<uint32_t> allele_depth_rev;
92     uint32_t depth, depth_fwd, depth_rev;
93     uint32_t base_qualities_sum;
94 
95 
96     //vntr specific record
97     //std::vector<float> counts;
98 
99     // sample level information
100     int32_t nsamples;
101     kstring_t alleles;
102     std::vector<std::string> v_alleles;
103     uint32_t n_filter;
104 
105     uint8_t* pls;
106     uint8_t* ads;
107 
108     // sufficient statistics for computing INFO field
109     float bqr_num, bqr_den;
110     float mqr_num, mqr_den;
111     float cyr_num, cyr_den;
112     float str_num, str_den;
113     float nmr_num, nmr_den;
114     float ior_num, ior_den;
115     float nm0_num, nm0_den;
116     float nm1_num, nm1_den;
117     float abe_num, abe_den;
118     float abz_num, abz_den;
119     float ns_nref, dp_sum, max_gq;
120 
121     // temporary information to be cleared out per-sample basis
122     int32_t tmp_dp_q20;
123     int32_t tmp_dp_ra;
124     int32_t tmp_bq_s1, tmp_bq_s2;
125     int32_t tmp_mq_s1, tmp_mq_s2;
126     float tmp_cy_s1, tmp_cy_s2;
127     int32_t tmp_st_s1, tmp_st_s2;
128     int32_t tmp_al_s1, tmp_bq_al, tmp_mq_al;
129     float  tmp_cy_al;
130     int32_t tmp_st_al, tmp_nm_al;
131     int32_t tmp_nm_s1, tmp_nm_s2;
132     double tmp_oth_exp_q20, tmp_oth_obs_q20;
133     double tmp_pls[3];
134     double tmp_ads[3];
135 
136     /**
137      * Constructor.
138      */
GenotypingRecord()139     GenotypingRecord() {};
140 
141     /**
142      * Constructor.
143      *
144      * @h       - VCF header.
145      * @v       - VCF record.
146      * nsamples - number of samples.
147      * ploidy   - ploidy of this variant
148      *
149      * future todo: ploidy be an array of integers of length no_samples to allow for local copy number.
150      */
GenotypingRecord(bcf_hdr_t * h,bcf1_t * v,int32_t nsamples,int32_t ploidy)151     GenotypingRecord(bcf_hdr_t *h, bcf1_t *v, int32_t nsamples, int32_t ploidy) {};
152 
153     /**
154      * Destructor.
155      */
~GenotypingRecord()156     virtual ~GenotypingRecord() {};
157 
158     /**
159      * Clears this record.
160      */
clear()161     virtual void clear() {};
162 
163     /**
164      * Clears the temporary variables.
165      */
clearTemp()166     virtual void clearTemp() {};
167 
168     /**
169      * Flushes variant.
170      * This returns a single line BCF recordfor all the samples.
171      */
flush_variant(bcf_hdr_t * hdr)172     virtual bcf1_t* flush_variant(bcf_hdr_t* hdr) {return NULL;};
173 
174     /**
175      * Flush sample.  This is used for sequential reading of each sample.
176      */
flush_sample(int32_t sampleIndex)177     virtual void flush_sample( int32_t sampleIndex ) {};
178 
179     /**
180      * Clears this record.
181      */
add_allele(double contam,int32_t allele,uint8_t mapq,bool fwd,uint32_t q,int32_t cycle,uint32_t nm)182     virtual void add_allele( double contam, int32_t allele, uint8_t mapq, bool fwd, uint32_t q, int32_t cycle, uint32_t nm ) {};
183 
184     /**
185      * Clears this record.
186      */
process_read(AugmentedBAMRecord & as,int32_t sampleIndex,double contam)187     virtual void process_read(AugmentedBAMRecord& as, int32_t sampleIndex, double contam) {};
188 };
189 
190 #endif
191