1 /* The MIT License
2 
3    Copyright (c) 2017 Adrian Tan <atks@umich.edu>
4 
5    Permission is hereby granted, free of charge, to any person obtaining a copy
6    of this software and associated documentation files (the "Software"), to deal
7    in the Software without restriction, including without limitation the rights
8    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9    copies of the Software, and to permit persons to whom the Software is
10    furnished to do so, subject to the following conditions:
11 
12    The above copyright notice and this permission notice shall be included in
13    all copies or substantial portions of the Software.
14 
15    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21    THE SOFTWARE.
22 */
23 
24 #include "indel_annotator.h"
25 
26 /**
27  * Constructor.
28  */
IndelAnnotator(std::string & ref_fasta_file,bool debug)29 IndelAnnotator::IndelAnnotator(std::string& ref_fasta_file, bool debug)
30 {
31     vm = new VariantManip(ref_fasta_file.c_str());
32 
33     fai = fai_load(ref_fasta_file.c_str());
34     if (fai==NULL)
35     {
36         fprintf(stderr, "[%s:%d %s] Cannot load genome index: %s\n", __FILE__, __LINE__, __FUNCTION__, ref_fasta_file.c_str());
37         exit(1);
38     }
39 
40     cre = new CandidateRegionExtractor(ref_fasta_file, debug);
41     cmp = new CandidateMotifPicker(debug);
42     fd = new FlankDetector(ref_fasta_file, debug);
43 
44     this->debug = debug;
45 };
46 
47 /**
48  * Destructor.
49  */
~IndelAnnotator()50 IndelAnnotator::~IndelAnnotator()
51 {
52     delete vm;
53     fai_destroy(fai);
54 }
55 
56 /**
57  * Annotates VNTR characteristics.
58  * @mode - e for exact alignment annotation
59  *       - f for fuzzy alignment annotation
60  *       -
61  */
annotate(Variant & variant)62 void IndelAnnotator::annotate(Variant& variant)
63 {
64     VNTR& vntr = variant.vntr;
65 
66     bcf_hdr_t* h = variant.h;
67     bcf1_t* v = variant.v;
68 
69     //update chromosome and position
70     variant.rid = bcf_get_rid(v);
71     variant.pos1 = bcf_get_pos1(v);
72 
73     //this is for reannotating an VNTR record
74     //this is more for the purpose of evaluation to
75     //check if vt's algorithm is concordant with
76     //VNTRs from other sources.
77     if (variant.type==VT_VNTR)
78     {
79         if (debug) std::cerr << "============================================\n";
80         if (debug) std::cerr << "ANNOTATING VNTR/STR \n";
81 
82         //1. pick candidate region
83         cre->pick_candidate_region(variant, REFERENCE, FINAL);
84 
85         //2. set motifs from info field
86         cmp->set_motif_from_info_field(variant);
87 
88         //3. compute purity scores
89         fd->compute_purity_score(variant, FINAL);
90 
91         //4. compute composition and sequence statistics
92         fd->compute_composition_and_entropy(variant, FINAL);
93     }
94     //main purpose - annotation of Indels.
95     else if (variant.type&VT_INDEL)
96     {
97         //the basic steps in annotating a TR
98         //
99         //1. extract a region that has a chance of containing the repeat units
100         //2. choose a set of candidate motifs and pick motif
101         //3. detect repeat region and evaluate
102         //4. iterate 2 and 3
103 
104         if (debug) std::cerr << "============================================\n";
105         if (debug) std::cerr << "ANNOTATING INDEL\n";
106 
107 
108 
109 
110 
111         //1. selects candidate region by fuzzy left and right alignment
112         cre->pick_candidate_region(variant, EXACT_LEFT_RIGHT_ALIGNMENT, EXACT);
113         cmp->update_exact_repeat_unit(variant);
114 
115         //2. detect candidate motifs from a reference sequence
116         cmp->generate_candidate_motifs(variant);
117 
118         //this cannot possibly fail as next_motif() guarantees it
119         if (!cmp->next_motif(variant, CHECK_MOTIF_PRESENCE_IN_ALLELE))
120         {
121             //fall back on exact motif chosen.
122             VNTR& vntr = variant.vntr;
123             vntr.fuzzy_motif = vntr.exact_motif;
124             vntr.fuzzy_ru = vntr.exact_ru;
125             vntr.fuzzy_basis = vntr.exact_basis;
126             vntr.fuzzy_mlen = vntr.exact_mlen;
127             vntr.fuzzy_blen = vntr.exact_blen;;
128 
129             if (debug) std::cerr << "updating fuzzy motif with exact motifs\n";
130         }
131 
132         fd->detect_flanks(variant, EXACT);
133         fd->compute_purity_score(variant, EXACT);
134         fd->compute_composition_and_entropy(variant, EXACT);
135 
136         fd->detect_flanks(variant, FUZZY);
137         fd->compute_purity_score(variant, FUZZY);
138         fd->compute_composition_and_entropy(variant, FUZZY);
139 
140         //introduce reiteration based on concordance and exact concordance.
141 
142         if (debug)
143         {
144             std::cerr << "\n";
145             vntr.print();
146             std::cerr << "\n";
147         }
148 
149         if (debug) std::cerr << "============================================\n";
150         return;
151 
152     }
153 }