1 /* The MIT License
2 
3    Copyright (c) 2015 Adrian Tan <atks@umich.edu>
4 
5    Permission is hereby granted, free of charge, to any person obtaining a copy
6    of this software and associated documentation files (the "Software"), to deal
7    in the Software without restriction, including without limitation the rights
8    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9    copies of the Software, and to permit persons to whom the Software is
10    furnished to do so, subject to the following conditions:
11 
12    The above copyright notice and this permission notice shall be included in
13    all copies or substantial portions of the Software.
14 
15    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21    THE SOFTWARE.
22 */
23 
24 #include "candidate_region_extractor.h"
25 
26 /**
27  * Constructor.
28  */
CandidateRegionExtractor(std::string & ref_fasta_file,bool debug)29 CandidateRegionExtractor::CandidateRegionExtractor(std::string& ref_fasta_file, bool debug)
30 {
31     vm = new VariantManip(ref_fasta_file.c_str());
32     rs = new ReferenceSequence(ref_fasta_file);
33 
34     this->debug = debug;
35 };
36 
37 /**
38  * Destructor.
39  */
~CandidateRegionExtractor()40 CandidateRegionExtractor::~CandidateRegionExtractor()
41 {
42     delete vm;
43     delete rs;
44 }
45 
46 /**
47  * Pick candidate region.
48  *
49  * @mode - REFERENCE     use refence field
50  *       - ALLELE_EXACT  by exact alignment
51  *       - ALLELE_FUZZY  by fuzzy alignment
52  */
pick_candidate_region(Variant & variant,int32_t mode,int32_t amode)53 void CandidateRegionExtractor::pick_candidate_region(Variant& variant, int32_t mode, int32_t amode)
54 {
55     bcf_hdr_t* h = variant.h;
56     bcf1_t* v = variant.v;
57 
58     if (mode==REFERENCE)
59     {
60         if (amode&FINAL)
61         {
62             VNTR& vntr = variant.vntr;
63             vntr.repeat_tract.assign(bcf_get_ref(v));
64             vntr.beg1 = bcf_get_pos1(v);
65             vntr.end1 = bcf_get_end1(v);
66             vntr.rl = vntr.end1-vntr.beg1+1;
67             vntr.ll = vntr.rl; //??
68         }
69 
70         if (amode&EXACT)
71         {
72             VNTR& vntr = variant.vntr;
73             vntr.exact_repeat_tract.assign(bcf_get_ref(v));
74             vntr.exact_beg1 = bcf_get_pos1(v);
75             vntr.exact_end1 = bcf_get_end1(v);
76             vntr.exact_rl = vntr.exact_end1-vntr.exact_beg1+1;
77             vntr.exact_ll = vntr.exact_rl; //??
78         }
79 
80         if (amode&FUZZY)
81         {
82             VNTR& vntr = variant.vntr;
83             vntr.fuzzy_repeat_tract.assign(bcf_get_ref(v));
84             vntr.fuzzy_beg1 = bcf_get_pos1(v);
85             vntr.fuzzy_end1 = bcf_get_end1(v);
86             vntr.fuzzy_rl = vntr.fuzzy_end1-vntr.fuzzy_beg1+1;
87             vntr.fuzzy_ll = vntr.fuzzy_rl; //??
88         }
89     }
90     else if (mode==EXACT_LEFT_RIGHT_ALIGNMENT)
91     {
92         if (amode==EXACT)
93         {
94             extract_regions_by_exact_alignment(variant);
95         }
96         else
97         {
98             fprintf(stderr, "[E:%s] EXACT_LEFT_RIGHT_ALIGNMENT cannot be updated for any other attribute types beside EXACT.\n", __FUNCTION__);
99             exit(1);
100         }
101     }
102     else if (mode==FUZZY_LEFT_RIGHT_ALIGNMENT)
103     {
104         if (amode==FUZZY)
105         {
106             extract_regions_by_fuzzy_alignment(variant);
107         }
108         else
109         {
110             fprintf(stderr, "[E:%s] EXACT_LEFT_RIGHT_ALIGNMENT cannot be updated for any other attribute types beside FUZZY.\n", __FUNCTION__);
111             exit(1);
112         }
113     }
114 }
115 
116 /**
117  * Chooses a phase of the motif that is appropriate for the alignment
118  */
choose_repeat_unit(std::string & ref,std::string & motif)119 std::string CandidateRegionExtractor::choose_repeat_unit(std::string& ref, std::string& motif)
120 {
121     for (uint32_t i=0; i<motif.size(); ++i)
122     {
123         std::string smotif = VNTR::shift_str(motif, i);
124         if (ref.compare(0, smotif.size(), smotif)==0)
125         {
126             return smotif;
127         }
128     }
129 
130     return motif;
131 }
132 
133 /**
134  * Checks if a vntr is a homopolymer.
135  */
is_homopolymer(bcf_hdr_t * h,bcf1_t * v)136 bool CandidateRegionExtractor::is_homopolymer(bcf_hdr_t* h, bcf1_t* v)
137 {
138     bool is_homopolymer = false;
139     uint32_t ref_len = strlen(bcf_get_ref(v));
140     for (size_t i=1; i<bcf_get_n_allele(v); ++i)
141     {
142         std::string ref(bcf_get_alt(v, 0));
143         std::string alt(bcf_get_alt(v, i));
144         int32_t pos1 = bcf_get_pos1(v);
145     }
146 
147     return is_homopolymer;
148 }
149 
150 /**
151  * Extract reference sequence region for motif discovery.
152  *
153  * The input is a VCF record that contains an indel.
154  *
155  * If the the indel has multiple alleles, it will examine all
156  * alleles.
157  *
158  * todo: is might be a good idea to combine this step with motif detection
159  *       since there seems to be a need to have an iterative process here
160  *       to ensure a good candidate motif is chosen. *
161  */
extract_regions_by_exact_alignment(Variant & variant)162 void CandidateRegionExtractor::extract_regions_by_exact_alignment(Variant& variant)
163 {
164     if (debug)
165     {
166         if (debug) std::cerr << "********************************************\n";
167         std::cerr << "EXTRACTIING REGION BY EXACT LEFT AND RIGHT ALIGNMENT\n\n";
168     }
169 
170     bcf_hdr_t* h = variant.h;
171     bcf1_t* v = variant.v;
172 
173     VNTR& vntr = variant.vntr;
174     std::string& chrom = variant.chrom;
175 
176     int32_t min_beg1 = bcf_get_pos1(v);
177     int32_t max_end1 = min_beg1;
178 
179     if (debug)
180     {
181        bcf_print_liten(h, v);
182     }
183 
184     //merge candidate search region
185     for (size_t i=1; i<bcf_get_n_allele(v); ++i)
186     {
187         std::string ref(bcf_get_alt(v, 0));
188         std::string alt(bcf_get_alt(v, i));
189         int32_t pos1 = bcf_get_pos1(v);
190 
191         //this prevents introduction of flanks that do not harbour the repeat unit
192         trim(pos1, ref, alt);
193 
194         int32_t end1 = pos1 + ref.size() - 1;
195         std::cerr << end1 << " " << ref << "  " << alt << "\n";
196 
197         right_align(chrom, end1, ref, alt);
198 
199         std::cerr << end1 << " " << ref << "  " << alt << "\n";
200 
201 
202         int32_t beg1 = end1 - ref.size() + 1;
203         left_align(chrom, beg1, ref, alt);
204 
205         min_beg1 = beg1<min_beg1 ? beg1 : min_beg1;
206         max_end1 = end1>max_end1 ? end1 : max_end1;
207 
208         int32_t seq_len;
209         char* seq = rs->fetch_seq(chrom, min_beg1, max_end1);
210 
211         if (debug)
212         {
213             std::cerr << "EXACT REGION " << min_beg1 << "-" << max_end1 << " (" << max_end1-min_beg1+1 <<") from " << pos1 << ":" << ref << ":" << alt << "\n";
214             std::cerr << "             " << seq << "\n";
215         }
216 
217         if (seq) free(seq);
218     }
219 
220     char* seq = rs->fetch_seq(chrom, min_beg1, max_end1);
221 
222     if (debug)
223     {
224         std::cerr << "FINAL EXACT REGION " << min_beg1 << "-" << max_end1 << " (" << max_end1-min_beg1+1 <<") " << "\n";
225         std::cerr << "                   " << seq << "\n";
226     }
227 
228     vntr.exact_repeat_tract = seq;
229     vntr.rid = bcf_get_rid(v);
230     vntr.exact_beg1 = min_beg1;
231     vntr.exact_end1 = max_end1;
232 
233     if (seq) free(seq);
234 }
235 
236 /**
237  * Left align alleles.
238  */
left_align(std::string & chrom,int32_t & pos1,std::string & ref,std::string & alt)239 void CandidateRegionExtractor::left_align(std::string& chrom, int32_t& pos1, std::string& ref, std::string& alt)
240 {
241     int32_t seq_len;
242     while (ref.at(ref.size()-1)==alt.at(alt.size()-1) && pos1>1)
243     {
244         char base = rs->fetch_base(chrom, pos1-2);
245         ref.erase(ref.size()-1,1);
246         alt.erase(alt.size()-1,1);
247         ref.insert(0, 1, base);
248         alt.insert(0, 1, base);
249         --pos1;
250     }
251 }
252 
253 /**
254  * Right align alleles.
255  */
right_align(std::string & chrom,int32_t & pos1,std::string & ref,std::string & alt)256 void CandidateRegionExtractor::right_align(std::string& chrom, int32_t& pos1, std::string& ref, std::string& alt)
257 {
258     while (ref.at(0)==alt.at(0))
259     {
260         char base = rs->fetch_base(chrom, pos1+1);
261         ref.erase(0,1);
262         alt.erase(0,1);
263         ref.push_back(base);
264         alt.push_back(base);
265         ++pos1;
266     }
267 }
268 
269 /**
270  * Extract reference sequence region for motif discovery in a fuzzy fashion.
271  */
extract_regions_by_fuzzy_alignment(Variant & variant)272 void CandidateRegionExtractor::extract_regions_by_fuzzy_alignment(Variant& variant)
273 {
274     if (debug)
275     {
276         if (debug) std::cerr << "********************************************\n";
277         std::cerr << "EXTRACTIING REGION BY FUZZY ALIGNMENT\n\n";
278     }
279 
280     bcf_hdr_t* h = variant.h;
281     bcf1_t* v = variant.v;
282     std::string&  chrom = variant.chrom;
283 
284     int32_t min_beg1 = bcf_get_pos1(v);
285     int32_t max_end1 = min_beg1;
286 
287     //merge candidate search region
288     for (size_t i=1; i<bcf_get_n_allele(v); ++i)
289     {
290         std::string ref(bcf_get_alt(v, 0));
291         std::string alt(bcf_get_alt(v, i));
292         int32_t pos1 = bcf_get_pos1(v);
293 
294         trim(pos1, ref, alt);
295 
296         if (debug)
297         {
298             std::cerr << "indel fragment : " << (ref.size()<alt.size()? alt : ref) << "\n";
299             std::cerr << "               : " << ref << ":" << alt << "\n";
300         }
301 
302         min_beg1 = fuzzy_left_align(chrom, pos1, ref, alt, 3);
303         max_end1 = fuzzy_right_align(chrom, pos1 + ref.size() - 1, ref, alt, 3);
304 
305         char* seq = rs->fetch_seq(chrom, min_beg1-1, max_end1-1);
306         if (debug)
307         {
308             std::cerr << "FUZZY REGION " << min_beg1 << "-" << max_end1 << " (" << max_end1-min_beg1+1 <<") " << "\n";
309             std::cerr << "             " << seq << "\n";
310         }
311 
312         if (seq) free(seq);
313     }
314 
315     char* seq = rs->fetch_seq(chrom, min_beg1-1, max_end1-1);
316 
317     if (debug)
318     {
319         std::cerr << "FINAL FUZZY REGION " << min_beg1 << "-" << max_end1 << " (" << max_end1-min_beg1+1 <<") " << "\n";
320         std::cerr << "                   " << seq << "\n";
321     }
322 
323     VNTR& vntr = variant.vntr;
324     vntr.exact_repeat_tract.assign(seq);
325     vntr.exact_beg1 = min_beg1;
326 
327     if (seq) free(seq);
328 }
329 
330 /**
331  * Fuzzy left align alleles allowing for mismatches and indels defined by penalty.
332  *
333  * @chrom   - chromosome
334  * @pos1    - 1 based position
335  * @ref     - reference sequence
336  * @alt     - alternative sequence
337  * @penalty - mismatch/indels allowed
338  *
339  * Returns left aligned position.
340  */
fuzzy_left_align(std::string & chrom,int32_t pos1,std::string ref,std::string alt,uint32_t penalty)341 uint32_t CandidateRegionExtractor::fuzzy_left_align(std::string& chrom, int32_t pos1, std::string ref, std::string alt, uint32_t penalty)
342 {
343     if (ref==alt)
344     {
345         return pos1;
346     }
347 
348     //std::cerr << "fuzzy left alignment: " << chrom << ":" << pos1 << ":" << ref << ":" << alt << " (" <<  penalty << ")\n";
349     while (ref.at(ref.size()-1)==alt.at(alt.size()-1) && pos1>1)
350     {
351         char base = rs->fetch_base(chrom, pos1-2);
352         ref.erase(ref.size()-1,1);
353         alt.erase(alt.size()-1,1);
354         ref.insert(0, 1, base);
355         alt.insert(0, 1, base);
356         --pos1;
357     }
358 
359     if (penalty)
360     {
361         uint32_t pos1_sub = pos1;
362         uint32_t pos1_del = pos1;
363         uint32_t pos1_ins = pos1;
364 
365         //substitution
366         char base = rs->fetch_base(chrom, pos1-2);
367         std::string new_ref = ref;
368         std::string new_alt = alt;
369         new_ref.erase(new_ref.size()-1,1);
370         new_alt.erase(new_alt.size()-1,1);
371         new_ref.insert(0, 1, base);
372         new_alt.insert(0, 1, base);
373 //      std::cerr << "\tsub: " << chrom << ":" << pos1-1 << ":" << new_ref << ":" << new_alt << " (" <<  penalty-1 << ")\n";
374         pos1_sub = fuzzy_left_align(chrom, pos1-1, new_ref, new_alt, penalty-1);
375 
376         //deletion
377         if (ref.size()>1)
378         {
379             std::string new_ref = ref;
380             new_ref.erase(new_ref.size()-1,1);
381 //            std::cerr << "\tdel: " << chrom << ":" << pos1 << ":" << new_ref << ":" << alt << " (" <<  penalty-1 << ")\n";
382             pos1_del = fuzzy_left_align(chrom, pos1, new_ref, alt, penalty-1);
383         }
384 
385         //insertion
386         if (alt.size()>1)
387         {
388             std::string new_alt = alt;
389             new_alt.erase(new_alt.size()-1,1);
390 //            std::cerr << "\tins: " << chrom << ":" << pos1 << ":" << ref << ":" << new_alt << " (" <<  penalty-1 << ")\n";
391             pos1_ins = fuzzy_left_align(chrom, pos1, ref, new_alt, penalty-1);
392         }
393 
394         pos1 = std::min(pos1_sub, std::min(pos1_del, pos1_ins));
395     }
396 
397     return pos1;
398 }
399 
400 /**
401  * Fuzzy right align alleles allowing for mismatches and indels defined by penalty.
402  *
403  * @chrom   - chromosome
404  * @pos1    - 1 based position
405  * @ref     - reference sequence
406  * @alt     - alternative sequence
407  * @penalty - mismatch/indels allowed
408  *
409  * Returns right aligned position.
410  */
fuzzy_right_align(std::string & chrom,int32_t pos1,std::string ref,std::string alt,uint32_t penalty)411 uint32_t CandidateRegionExtractor::fuzzy_right_align(std::string& chrom, int32_t pos1, std::string ref, std::string alt, uint32_t penalty)
412 {
413     if (ref==alt)
414     {
415         return pos1;
416     }
417 
418     while (ref.at(0)==alt.at(0))
419     {
420         char base = rs->fetch_base(chrom, pos1);
421         ref.erase(0,1);
422         alt.erase(0,1);
423         ref.push_back(base);
424         alt.push_back(base);
425         ++pos1;
426     }
427 
428     if (penalty)
429     {
430         uint32_t pos1_sub = pos1;
431         uint32_t pos1_del = pos1;
432         uint32_t pos1_ins = pos1;
433 
434         //substitution
435         char base = rs->fetch_base(chrom, pos1);
436         std::string new_ref = ref;
437         std::string new_alt = alt;
438         new_ref.erase(0,1);
439         new_alt.erase(0,1);
440         new_ref.push_back(base);
441         new_alt.push_back(base);
442         //std::cerr << "\tsub: " << chrom << ":" << pos1+1 << ":" << new_ref << ":" << new_alt << " (" <<  penalty-1 << ")\n";
443         pos1_sub = fuzzy_right_align(chrom, pos1+1, new_ref, new_alt, penalty-1);
444 
445         //deletion
446         if (ref.size()>1)
447         {
448             std::string new_ref = ref;
449             new_ref.erase(0,1);
450             //std::cerr << "\tdel: " << chrom << ":" << pos1 << ":" << new_ref << ":" << alt << " (" <<  penalty-1 << ")\n";
451             pos1_del = fuzzy_right_align(chrom, pos1, new_ref, alt, penalty-1);
452         }
453 
454         //insertion
455         if (alt.size()>1)
456         {
457             std::string new_alt = alt;
458             new_alt.erase(0,1);
459             //std::cerr << "\tins: " << chrom << ":" << pos1 << ":" << ref << ":" << new_alt << " (" <<  penalty-1 << ")\n";
460             pos1_ins = fuzzy_right_align(chrom, pos1, ref, new_alt, penalty-1);
461         }
462 
463         pos1 = std::max(pos1_sub, std::max(pos1_del, pos1_ins));
464     }
465 
466     return pos1;
467 }
468 
469 /**
470  * Trim alleles.
471  */
trim(int32_t & pos1,std::string & ref,std::string & alt)472 void CandidateRegionExtractor::trim(int32_t& pos1, std::string& ref, std::string& alt)
473 {
474     while (true)
475     {
476         if (ref.size()==1 || alt.size()==1)
477         {
478             break;
479         }
480         else if (ref.at(0)!=alt.at(0) && ref.at(ref.size()-1)!=alt.at(alt.size()-1))
481         {
482             break;
483         }
484         else
485         {
486             //trim from the right side
487             if (ref.at(ref.size()-1)==alt.at(alt.size()-1))
488             {
489                 ref.erase(ref.size()-1,1);
490                 alt.erase(alt.size()-1,1);
491             }
492             //trim from the left side
493             else if (ref.at(0)==alt.at(0))
494             {
495                 ref.erase(0,1);
496                 alt.erase(0,1);
497                 ++pos1;
498             }
499 
500             //we choose one side to trim at a time to ensure that we do not accidentally end up with an empty allele
501         }
502     }
503 }