1 /* The MIT License
2
3 Copyright (c) 2015 Adrian Tan <atks@umich.edu>
4
5 Permission is hereby granted, free of charge, to any person obtaining a copy
6 of this software and associated documentation files (the "Software"), to deal
7 in the Software without restriction, including without limitation the rights
8 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 copies of the Software, and to permit persons to whom the Software is
10 furnished to do so, subject to the following conditions:
11
12 The above copyright notice and this permission notice shall be included in
13 all copies or substantial portions of the Software.
14
15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 THE SOFTWARE.
22 */
23
24 #include "candidate_region_extractor.h"
25
26 /**
27 * Constructor.
28 */
CandidateRegionExtractor(std::string & ref_fasta_file,bool debug)29 CandidateRegionExtractor::CandidateRegionExtractor(std::string& ref_fasta_file, bool debug)
30 {
31 vm = new VariantManip(ref_fasta_file.c_str());
32 rs = new ReferenceSequence(ref_fasta_file);
33
34 this->debug = debug;
35 };
36
37 /**
38 * Destructor.
39 */
~CandidateRegionExtractor()40 CandidateRegionExtractor::~CandidateRegionExtractor()
41 {
42 delete vm;
43 delete rs;
44 }
45
46 /**
47 * Pick candidate region.
48 *
49 * @mode - REFERENCE use refence field
50 * - ALLELE_EXACT by exact alignment
51 * - ALLELE_FUZZY by fuzzy alignment
52 */
pick_candidate_region(Variant & variant,int32_t mode,int32_t amode)53 void CandidateRegionExtractor::pick_candidate_region(Variant& variant, int32_t mode, int32_t amode)
54 {
55 bcf_hdr_t* h = variant.h;
56 bcf1_t* v = variant.v;
57
58 if (mode==REFERENCE)
59 {
60 if (amode&FINAL)
61 {
62 VNTR& vntr = variant.vntr;
63 vntr.repeat_tract.assign(bcf_get_ref(v));
64 vntr.beg1 = bcf_get_pos1(v);
65 vntr.end1 = bcf_get_end1(v);
66 vntr.rl = vntr.end1-vntr.beg1+1;
67 vntr.ll = vntr.rl; //??
68 }
69
70 if (amode&EXACT)
71 {
72 VNTR& vntr = variant.vntr;
73 vntr.exact_repeat_tract.assign(bcf_get_ref(v));
74 vntr.exact_beg1 = bcf_get_pos1(v);
75 vntr.exact_end1 = bcf_get_end1(v);
76 vntr.exact_rl = vntr.exact_end1-vntr.exact_beg1+1;
77 vntr.exact_ll = vntr.exact_rl; //??
78 }
79
80 if (amode&FUZZY)
81 {
82 VNTR& vntr = variant.vntr;
83 vntr.fuzzy_repeat_tract.assign(bcf_get_ref(v));
84 vntr.fuzzy_beg1 = bcf_get_pos1(v);
85 vntr.fuzzy_end1 = bcf_get_end1(v);
86 vntr.fuzzy_rl = vntr.fuzzy_end1-vntr.fuzzy_beg1+1;
87 vntr.fuzzy_ll = vntr.fuzzy_rl; //??
88 }
89 }
90 else if (mode==EXACT_LEFT_RIGHT_ALIGNMENT)
91 {
92 if (amode==EXACT)
93 {
94 extract_regions_by_exact_alignment(variant);
95 }
96 else
97 {
98 fprintf(stderr, "[E:%s] EXACT_LEFT_RIGHT_ALIGNMENT cannot be updated for any other attribute types beside EXACT.\n", __FUNCTION__);
99 exit(1);
100 }
101 }
102 else if (mode==FUZZY_LEFT_RIGHT_ALIGNMENT)
103 {
104 if (amode==FUZZY)
105 {
106 extract_regions_by_fuzzy_alignment(variant);
107 }
108 else
109 {
110 fprintf(stderr, "[E:%s] EXACT_LEFT_RIGHT_ALIGNMENT cannot be updated for any other attribute types beside FUZZY.\n", __FUNCTION__);
111 exit(1);
112 }
113 }
114 }
115
116 /**
117 * Chooses a phase of the motif that is appropriate for the alignment
118 */
choose_repeat_unit(std::string & ref,std::string & motif)119 std::string CandidateRegionExtractor::choose_repeat_unit(std::string& ref, std::string& motif)
120 {
121 for (uint32_t i=0; i<motif.size(); ++i)
122 {
123 std::string smotif = VNTR::shift_str(motif, i);
124 if (ref.compare(0, smotif.size(), smotif)==0)
125 {
126 return smotif;
127 }
128 }
129
130 return motif;
131 }
132
133 /**
134 * Checks if a vntr is a homopolymer.
135 */
is_homopolymer(bcf_hdr_t * h,bcf1_t * v)136 bool CandidateRegionExtractor::is_homopolymer(bcf_hdr_t* h, bcf1_t* v)
137 {
138 bool is_homopolymer = false;
139 uint32_t ref_len = strlen(bcf_get_ref(v));
140 for (size_t i=1; i<bcf_get_n_allele(v); ++i)
141 {
142 std::string ref(bcf_get_alt(v, 0));
143 std::string alt(bcf_get_alt(v, i));
144 int32_t pos1 = bcf_get_pos1(v);
145 }
146
147 return is_homopolymer;
148 }
149
150 /**
151 * Extract reference sequence region for motif discovery.
152 *
153 * The input is a VCF record that contains an indel.
154 *
155 * If the the indel has multiple alleles, it will examine all
156 * alleles.
157 *
158 * todo: is might be a good idea to combine this step with motif detection
159 * since there seems to be a need to have an iterative process here
160 * to ensure a good candidate motif is chosen. *
161 */
extract_regions_by_exact_alignment(Variant & variant)162 void CandidateRegionExtractor::extract_regions_by_exact_alignment(Variant& variant)
163 {
164 if (debug)
165 {
166 if (debug) std::cerr << "********************************************\n";
167 std::cerr << "EXTRACTIING REGION BY EXACT LEFT AND RIGHT ALIGNMENT\n\n";
168 }
169
170 bcf_hdr_t* h = variant.h;
171 bcf1_t* v = variant.v;
172
173 VNTR& vntr = variant.vntr;
174 std::string& chrom = variant.chrom;
175
176 int32_t min_beg1 = bcf_get_pos1(v);
177 int32_t max_end1 = min_beg1;
178
179 if (debug)
180 {
181 bcf_print_liten(h, v);
182 }
183
184 //merge candidate search region
185 for (size_t i=1; i<bcf_get_n_allele(v); ++i)
186 {
187 std::string ref(bcf_get_alt(v, 0));
188 std::string alt(bcf_get_alt(v, i));
189 int32_t pos1 = bcf_get_pos1(v);
190
191 //this prevents introduction of flanks that do not harbour the repeat unit
192 trim(pos1, ref, alt);
193
194 int32_t end1 = pos1 + ref.size() - 1;
195 std::cerr << end1 << " " << ref << " " << alt << "\n";
196
197 right_align(chrom, end1, ref, alt);
198
199 std::cerr << end1 << " " << ref << " " << alt << "\n";
200
201
202 int32_t beg1 = end1 - ref.size() + 1;
203 left_align(chrom, beg1, ref, alt);
204
205 min_beg1 = beg1<min_beg1 ? beg1 : min_beg1;
206 max_end1 = end1>max_end1 ? end1 : max_end1;
207
208 int32_t seq_len;
209 char* seq = rs->fetch_seq(chrom, min_beg1, max_end1);
210
211 if (debug)
212 {
213 std::cerr << "EXACT REGION " << min_beg1 << "-" << max_end1 << " (" << max_end1-min_beg1+1 <<") from " << pos1 << ":" << ref << ":" << alt << "\n";
214 std::cerr << " " << seq << "\n";
215 }
216
217 if (seq) free(seq);
218 }
219
220 char* seq = rs->fetch_seq(chrom, min_beg1, max_end1);
221
222 if (debug)
223 {
224 std::cerr << "FINAL EXACT REGION " << min_beg1 << "-" << max_end1 << " (" << max_end1-min_beg1+1 <<") " << "\n";
225 std::cerr << " " << seq << "\n";
226 }
227
228 vntr.exact_repeat_tract = seq;
229 vntr.rid = bcf_get_rid(v);
230 vntr.exact_beg1 = min_beg1;
231 vntr.exact_end1 = max_end1;
232
233 if (seq) free(seq);
234 }
235
236 /**
237 * Left align alleles.
238 */
left_align(std::string & chrom,int32_t & pos1,std::string & ref,std::string & alt)239 void CandidateRegionExtractor::left_align(std::string& chrom, int32_t& pos1, std::string& ref, std::string& alt)
240 {
241 int32_t seq_len;
242 while (ref.at(ref.size()-1)==alt.at(alt.size()-1) && pos1>1)
243 {
244 char base = rs->fetch_base(chrom, pos1-2);
245 ref.erase(ref.size()-1,1);
246 alt.erase(alt.size()-1,1);
247 ref.insert(0, 1, base);
248 alt.insert(0, 1, base);
249 --pos1;
250 }
251 }
252
253 /**
254 * Right align alleles.
255 */
right_align(std::string & chrom,int32_t & pos1,std::string & ref,std::string & alt)256 void CandidateRegionExtractor::right_align(std::string& chrom, int32_t& pos1, std::string& ref, std::string& alt)
257 {
258 while (ref.at(0)==alt.at(0))
259 {
260 char base = rs->fetch_base(chrom, pos1+1);
261 ref.erase(0,1);
262 alt.erase(0,1);
263 ref.push_back(base);
264 alt.push_back(base);
265 ++pos1;
266 }
267 }
268
269 /**
270 * Extract reference sequence region for motif discovery in a fuzzy fashion.
271 */
extract_regions_by_fuzzy_alignment(Variant & variant)272 void CandidateRegionExtractor::extract_regions_by_fuzzy_alignment(Variant& variant)
273 {
274 if (debug)
275 {
276 if (debug) std::cerr << "********************************************\n";
277 std::cerr << "EXTRACTIING REGION BY FUZZY ALIGNMENT\n\n";
278 }
279
280 bcf_hdr_t* h = variant.h;
281 bcf1_t* v = variant.v;
282 std::string& chrom = variant.chrom;
283
284 int32_t min_beg1 = bcf_get_pos1(v);
285 int32_t max_end1 = min_beg1;
286
287 //merge candidate search region
288 for (size_t i=1; i<bcf_get_n_allele(v); ++i)
289 {
290 std::string ref(bcf_get_alt(v, 0));
291 std::string alt(bcf_get_alt(v, i));
292 int32_t pos1 = bcf_get_pos1(v);
293
294 trim(pos1, ref, alt);
295
296 if (debug)
297 {
298 std::cerr << "indel fragment : " << (ref.size()<alt.size()? alt : ref) << "\n";
299 std::cerr << " : " << ref << ":" << alt << "\n";
300 }
301
302 min_beg1 = fuzzy_left_align(chrom, pos1, ref, alt, 3);
303 max_end1 = fuzzy_right_align(chrom, pos1 + ref.size() - 1, ref, alt, 3);
304
305 char* seq = rs->fetch_seq(chrom, min_beg1-1, max_end1-1);
306 if (debug)
307 {
308 std::cerr << "FUZZY REGION " << min_beg1 << "-" << max_end1 << " (" << max_end1-min_beg1+1 <<") " << "\n";
309 std::cerr << " " << seq << "\n";
310 }
311
312 if (seq) free(seq);
313 }
314
315 char* seq = rs->fetch_seq(chrom, min_beg1-1, max_end1-1);
316
317 if (debug)
318 {
319 std::cerr << "FINAL FUZZY REGION " << min_beg1 << "-" << max_end1 << " (" << max_end1-min_beg1+1 <<") " << "\n";
320 std::cerr << " " << seq << "\n";
321 }
322
323 VNTR& vntr = variant.vntr;
324 vntr.exact_repeat_tract.assign(seq);
325 vntr.exact_beg1 = min_beg1;
326
327 if (seq) free(seq);
328 }
329
330 /**
331 * Fuzzy left align alleles allowing for mismatches and indels defined by penalty.
332 *
333 * @chrom - chromosome
334 * @pos1 - 1 based position
335 * @ref - reference sequence
336 * @alt - alternative sequence
337 * @penalty - mismatch/indels allowed
338 *
339 * Returns left aligned position.
340 */
fuzzy_left_align(std::string & chrom,int32_t pos1,std::string ref,std::string alt,uint32_t penalty)341 uint32_t CandidateRegionExtractor::fuzzy_left_align(std::string& chrom, int32_t pos1, std::string ref, std::string alt, uint32_t penalty)
342 {
343 if (ref==alt)
344 {
345 return pos1;
346 }
347
348 //std::cerr << "fuzzy left alignment: " << chrom << ":" << pos1 << ":" << ref << ":" << alt << " (" << penalty << ")\n";
349 while (ref.at(ref.size()-1)==alt.at(alt.size()-1) && pos1>1)
350 {
351 char base = rs->fetch_base(chrom, pos1-2);
352 ref.erase(ref.size()-1,1);
353 alt.erase(alt.size()-1,1);
354 ref.insert(0, 1, base);
355 alt.insert(0, 1, base);
356 --pos1;
357 }
358
359 if (penalty)
360 {
361 uint32_t pos1_sub = pos1;
362 uint32_t pos1_del = pos1;
363 uint32_t pos1_ins = pos1;
364
365 //substitution
366 char base = rs->fetch_base(chrom, pos1-2);
367 std::string new_ref = ref;
368 std::string new_alt = alt;
369 new_ref.erase(new_ref.size()-1,1);
370 new_alt.erase(new_alt.size()-1,1);
371 new_ref.insert(0, 1, base);
372 new_alt.insert(0, 1, base);
373 // std::cerr << "\tsub: " << chrom << ":" << pos1-1 << ":" << new_ref << ":" << new_alt << " (" << penalty-1 << ")\n";
374 pos1_sub = fuzzy_left_align(chrom, pos1-1, new_ref, new_alt, penalty-1);
375
376 //deletion
377 if (ref.size()>1)
378 {
379 std::string new_ref = ref;
380 new_ref.erase(new_ref.size()-1,1);
381 // std::cerr << "\tdel: " << chrom << ":" << pos1 << ":" << new_ref << ":" << alt << " (" << penalty-1 << ")\n";
382 pos1_del = fuzzy_left_align(chrom, pos1, new_ref, alt, penalty-1);
383 }
384
385 //insertion
386 if (alt.size()>1)
387 {
388 std::string new_alt = alt;
389 new_alt.erase(new_alt.size()-1,1);
390 // std::cerr << "\tins: " << chrom << ":" << pos1 << ":" << ref << ":" << new_alt << " (" << penalty-1 << ")\n";
391 pos1_ins = fuzzy_left_align(chrom, pos1, ref, new_alt, penalty-1);
392 }
393
394 pos1 = std::min(pos1_sub, std::min(pos1_del, pos1_ins));
395 }
396
397 return pos1;
398 }
399
400 /**
401 * Fuzzy right align alleles allowing for mismatches and indels defined by penalty.
402 *
403 * @chrom - chromosome
404 * @pos1 - 1 based position
405 * @ref - reference sequence
406 * @alt - alternative sequence
407 * @penalty - mismatch/indels allowed
408 *
409 * Returns right aligned position.
410 */
fuzzy_right_align(std::string & chrom,int32_t pos1,std::string ref,std::string alt,uint32_t penalty)411 uint32_t CandidateRegionExtractor::fuzzy_right_align(std::string& chrom, int32_t pos1, std::string ref, std::string alt, uint32_t penalty)
412 {
413 if (ref==alt)
414 {
415 return pos1;
416 }
417
418 while (ref.at(0)==alt.at(0))
419 {
420 char base = rs->fetch_base(chrom, pos1);
421 ref.erase(0,1);
422 alt.erase(0,1);
423 ref.push_back(base);
424 alt.push_back(base);
425 ++pos1;
426 }
427
428 if (penalty)
429 {
430 uint32_t pos1_sub = pos1;
431 uint32_t pos1_del = pos1;
432 uint32_t pos1_ins = pos1;
433
434 //substitution
435 char base = rs->fetch_base(chrom, pos1);
436 std::string new_ref = ref;
437 std::string new_alt = alt;
438 new_ref.erase(0,1);
439 new_alt.erase(0,1);
440 new_ref.push_back(base);
441 new_alt.push_back(base);
442 //std::cerr << "\tsub: " << chrom << ":" << pos1+1 << ":" << new_ref << ":" << new_alt << " (" << penalty-1 << ")\n";
443 pos1_sub = fuzzy_right_align(chrom, pos1+1, new_ref, new_alt, penalty-1);
444
445 //deletion
446 if (ref.size()>1)
447 {
448 std::string new_ref = ref;
449 new_ref.erase(0,1);
450 //std::cerr << "\tdel: " << chrom << ":" << pos1 << ":" << new_ref << ":" << alt << " (" << penalty-1 << ")\n";
451 pos1_del = fuzzy_right_align(chrom, pos1, new_ref, alt, penalty-1);
452 }
453
454 //insertion
455 if (alt.size()>1)
456 {
457 std::string new_alt = alt;
458 new_alt.erase(0,1);
459 //std::cerr << "\tins: " << chrom << ":" << pos1 << ":" << ref << ":" << new_alt << " (" << penalty-1 << ")\n";
460 pos1_ins = fuzzy_right_align(chrom, pos1, ref, new_alt, penalty-1);
461 }
462
463 pos1 = std::max(pos1_sub, std::max(pos1_del, pos1_ins));
464 }
465
466 return pos1;
467 }
468
469 /**
470 * Trim alleles.
471 */
trim(int32_t & pos1,std::string & ref,std::string & alt)472 void CandidateRegionExtractor::trim(int32_t& pos1, std::string& ref, std::string& alt)
473 {
474 while (true)
475 {
476 if (ref.size()==1 || alt.size()==1)
477 {
478 break;
479 }
480 else if (ref.at(0)!=alt.at(0) && ref.at(ref.size()-1)!=alt.at(alt.size()-1))
481 {
482 break;
483 }
484 else
485 {
486 //trim from the right side
487 if (ref.at(ref.size()-1)==alt.at(alt.size()-1))
488 {
489 ref.erase(ref.size()-1,1);
490 alt.erase(alt.size()-1,1);
491 }
492 //trim from the left side
493 else if (ref.at(0)==alt.at(0))
494 {
495 ref.erase(0,1);
496 alt.erase(0,1);
497 ++pos1;
498 }
499
500 //we choose one side to trim at a time to ensure that we do not accidentally end up with an empty allele
501 }
502 }
503 }