1 /* The MIT License 2 3 Copyright (c) 2016 Hyun Min Kang <hmkang.umich.edu> and Adrian Tan <atks@umich.edu> 4 5 Permission is hereby granted, free of charge, to any person obtaining a copy 6 of this software and associated documentation files (the "Software"), to deal 7 in the Software without restriction, including without limitation the rights 8 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 copies of the Software, and to permit persons to whom the Software is 10 furnished to do so, subject to the following conditions: 11 12 The above copyright notice and this permission notice shall be included in 13 all copies or substantial portions of the Software. 14 15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 THE SOFTWARE. 22 */ 23 24 #ifndef GENOTYPING_RECORD_H 25 #define GENOTYPING_RECORD_H 26 27 #include "htslib/vcf.h" 28 #include "htslib/faidx.h" 29 #include "bcf_ordered_writer.h" 30 #include "variant.h" 31 #include "hts_utils.h" 32 #include "augmented_bam_record.h" 33 #include "estimator.h" 34 35 #define FILTER_MASK_OVERLAP_SNP 0x0001 36 #define FILTER_MASK_OVERLAP_INDEL 0x0002 37 #define FILTER_MASK_OVERLAP_VNTR 0x0004 38 39 /** 40 * A generic record that holds information for genotyping a 41 * variant across multiple samples. 42 * 43 * Maintains read information and allows for additional reads 44 * till VCF record can be printed out. 45 */ 46 class GenotypingRecord 47 { 48 public: 49 bcf_hdr_t *h; 50 bcf1_t *v; 51 int32_t rid; 52 int32_t pos1; //position of variant 53 //[beg1,end1] is the required overlapping of the variant against the aligned read necessary to make a genotype call. 54 //for SNPs, beg1=end1=pos1 55 // 56 //for Indels, this refers to the flanking positions 57 // insertion 58 // if T/TG - beg1=pos1, end1=pos1+1 59 // if T/GT - beg1=pos1-1, end1=pos1 60 // deletion 61 // if TG/T - beg1=pos1, end1=pos1+length(REF) 62 // if TG/G - beg1=pos1-1, end1=pos1+length(REF)-1 63 int32_t beg1; 64 int32_t end1; 65 int32_t vtype; 66 67 //indel specific record 68 int32_t dlen; 69 int32_t len; 70 std::string indel; 71 72 //vntr specific record 73 std::string motif; 74 75 std::vector<std::string> indel_alleles; 76 std::vector<float> counts; 77 78 //for records that observe at least one alternate observation 79 std::vector<uint32_t> bqs; //for SNPs, store BQ, for Indels, store AQ 80 std::vector<uint32_t> aqs; // store AQ 81 std::vector<uint32_t> mqs; //map qualities 82 std::string sts; //strands 83 std::vector<int32_t> als; //alleles 84 std::string dls; //descriptive alleles 85 std::vector<uint32_t> cys; //cycles 86 std::vector<uint32_t> nms; //number of mismatches 87 88 //for records that only have reference observation 89 uint32_t no_nonref; 90 std::vector<uint32_t> allele_depth_fwd; 91 std::vector<uint32_t> allele_depth_rev; 92 uint32_t depth, depth_fwd, depth_rev; 93 uint32_t base_qualities_sum; 94 95 96 //vntr specific record 97 //std::vector<float> counts; 98 99 // sample level information 100 int32_t nsamples; 101 kstring_t alleles; 102 std::vector<std::string> v_alleles; 103 uint32_t n_filter; 104 105 uint8_t* pls; 106 uint8_t* ads; 107 108 // sufficient statistics for computing INFO field 109 float bqr_num, bqr_den; 110 float mqr_num, mqr_den; 111 float cyr_num, cyr_den; 112 float str_num, str_den; 113 float nmr_num, nmr_den; 114 float ior_num, ior_den; 115 float nm0_num, nm0_den; 116 float nm1_num, nm1_den; 117 float abe_num, abe_den; 118 float abz_num, abz_den; 119 float ns_nref, dp_sum, max_gq; 120 121 // temporary information to be cleared out per-sample basis 122 int32_t tmp_dp_q20; 123 int32_t tmp_dp_ra; 124 int32_t tmp_bq_s1, tmp_bq_s2; 125 int32_t tmp_mq_s1, tmp_mq_s2; 126 float tmp_cy_s1, tmp_cy_s2; 127 int32_t tmp_st_s1, tmp_st_s2; 128 int32_t tmp_al_s1, tmp_bq_al, tmp_mq_al; 129 float tmp_cy_al; 130 int32_t tmp_st_al, tmp_nm_al; 131 int32_t tmp_nm_s1, tmp_nm_s2; 132 double tmp_oth_exp_q20, tmp_oth_obs_q20; 133 double tmp_pls[3]; 134 double tmp_ads[3]; 135 136 /** 137 * Constructor. 138 */ GenotypingRecord()139 GenotypingRecord() {}; 140 141 /** 142 * Constructor. 143 * 144 * @h - VCF header. 145 * @v - VCF record. 146 * nsamples - number of samples. 147 * ploidy - ploidy of this variant 148 * 149 * future todo: ploidy be an array of integers of length no_samples to allow for local copy number. 150 */ GenotypingRecord(bcf_hdr_t * h,bcf1_t * v,int32_t nsamples,int32_t ploidy)151 GenotypingRecord(bcf_hdr_t *h, bcf1_t *v, int32_t nsamples, int32_t ploidy) {}; 152 153 /** 154 * Destructor. 155 */ ~GenotypingRecord()156 virtual ~GenotypingRecord() {}; 157 158 /** 159 * Clears this record. 160 */ clear()161 virtual void clear() {}; 162 163 /** 164 * Clears the temporary variables. 165 */ clearTemp()166 virtual void clearTemp() {}; 167 168 /** 169 * Flushes variant. 170 * This returns a single line BCF recordfor all the samples. 171 */ flush_variant(bcf_hdr_t * hdr)172 virtual bcf1_t* flush_variant(bcf_hdr_t* hdr) {return NULL;}; 173 174 /** 175 * Flush sample. This is used for sequential reading of each sample. 176 */ flush_sample(int32_t sampleIndex)177 virtual void flush_sample( int32_t sampleIndex ) {}; 178 179 /** 180 * Clears this record. 181 */ add_allele(double contam,int32_t allele,uint8_t mapq,bool fwd,uint32_t q,int32_t cycle,uint32_t nm)182 virtual void add_allele( double contam, int32_t allele, uint8_t mapq, bool fwd, uint32_t q, int32_t cycle, uint32_t nm ) {}; 183 184 /** 185 * Clears this record. 186 */ process_read(AugmentedBAMRecord & as,int32_t sampleIndex,double contam)187 virtual void process_read(AugmentedBAMRecord& as, int32_t sampleIndex, double contam) {}; 188 }; 189 190 #endif 191