1 /* The MIT License
2
3 Copyright (c) 2015 Adrian Tan <atks@umich.edu>
4
5 Permission is hereby granted, free of charge, to any person obtaining a copy
6 of this software and associated documentation files (the "Software"), to deal
7 in the Software without restriction, including without limitation the rights
8 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 copies of the Software, and to permit persons to whom the Software is
10 furnished to do so, subject to the following conditions:
11
12 The above copyright notice and this permission notice shall be included in
13 all copies or substantial portions of the Software.
14
15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 THE SOFTWARE.
22 */
23
24 #include "flank_detector.h"
25
26 /**
27 * Constructor.
28 */
FlankDetector(std::string & ref_fasta_file,bool debug)29 FlankDetector::FlankDetector(std::string& ref_fasta_file, bool debug)
30 {
31 //////////////////////
32 //initialize variables
33 //////////////////////
34 this->debug = debug;
35
36 ///////////////////
37 //initialize raHMMs
38 ///////////////////
39 // float delta = 0.0001;
40 // float epsilon = 0.0005;
41 // float tau = 0.01;
42 // float eta = 0.01;
43 // float mismatch_penalty = 3;
44 float delta = 0.0000001;
45 float epsilon = 0.0000001;
46 float tau = 0.01;
47 float eta = 0.01;
48 float mismatch_penalty = 5;
49
50 ahmm = new AHMM(false);
51 ahmm->set_delta(delta);
52 ahmm->set_epsilon(epsilon);
53 ahmm->set_tau(tau);
54 ahmm->set_eta(eta);
55 ahmm->set_mismatch_penalty(mismatch_penalty);
56 ahmm->initialize_T();
57
58 lfhmm = new LFHMM(false);
59 lfhmm->set_delta(delta);
60 lfhmm->set_epsilon(epsilon);
61 lfhmm->set_tau(tau);
62 lfhmm->set_eta(eta);
63 lfhmm->set_mismatch_penalty(mismatch_penalty);
64 lfhmm->initialize_T();
65
66 rfhmm = new RFHMM(false);
67 rfhmm->set_delta(delta);
68 rfhmm->set_epsilon(epsilon);
69 rfhmm->set_tau(tau);
70 rfhmm->set_eta(eta);
71 rfhmm->set_mismatch_penalty(mismatch_penalty);
72 rfhmm->initialize_T();
73
74 qual.assign(1024, 'K');
75
76 //////////////////
77 //initialize tools
78 //////////////////
79 rs = new ReferenceSequence(ref_fasta_file);
80 };
81
82 /**
83 * Destructor.
84 */
~FlankDetector()85 FlankDetector::~FlankDetector()
86 {
87 delete rs;
88 }
89
90 /**
91 * Detect repeat region.
92 *
93 * updates
94 * beg1
95 * end1
96 * repeat_tract
97 */
detect_flanks(Variant & variant,uint32_t mode)98 void FlankDetector::detect_flanks(Variant& variant, uint32_t mode)
99 {
100 VNTR& vntr = variant.vntr;
101
102 //simple single base pair clipping of ends
103 if (mode==EXACT)
104 {
105 if (debug)
106 {
107 std::cerr << "********************************************\n";
108 std::cerr << "EXACT\n";
109 std:: cerr << "\n";
110 }
111
112 if (debug)
113 {
114 std::cerr << "++++++++++++++++++++++++++++++++++++++++++++\n";
115 std::cerr << "Detect and remove anchor base\n";
116 }
117
118 if (vntr.exact_repeat_tract.size()>2)
119 {
120 //removing the anchor bases
121 if (vntr.mlen==1)
122 {
123 int32_t offset = 0;
124 int32_t length = vntr.exact_repeat_tract.size();
125 if (vntr.exact_repeat_tract.at(0)!=vntr.motif.at(0))
126 {
127 offset = 1;
128 ++vntr.exact_beg1;
129 }
130
131 if (vntr.exact_repeat_tract.at(vntr.exact_repeat_tract.size()-1)!=vntr.motif.at(0))
132 {
133 length -= offset+1;
134 --vntr.exact_end1;
135 }
136
137 vntr.exact_repeat_tract = vntr.exact_repeat_tract.substr(offset, length);
138 }
139 else
140 {
141 //this works only for simple indels
142 if (vntr.exact_repeat_tract.size()>=3)
143 {
144 vntr.exact_repeat_tract = vntr.exact_repeat_tract.substr(1, vntr.exact_repeat_tract.size()-2);
145 ++vntr.exact_beg1;
146 --vntr.exact_end1;
147 }
148 }
149 }
150 //this is for nonexistent repeat units
151 // RU : T
152 // repeat_tract : G[T]C where T is an insert
153 else if (vntr.exact_repeat_tract.size()==2)
154 {
155
156 }
157
158 if (debug)
159 {
160 std::cerr << "\n";
161 std::cerr << "repeat_tract : " << vntr.exact_repeat_tract << "\n";
162 std::cerr << "position : [" << vntr.exact_beg1 << "," << vntr.exact_end1 << "]\n";
163 std::cerr << "score : " << vntr.exact_score << "\n";
164 std::cerr << "trf score : " << vntr.exact_trf_score << "\n";
165 std::cerr << "repeat units : " << vntr.exact_rl << "\n";
166 std::cerr << "perfect repeat units : " << vntr.exact_no_perfect_ru << "\n";
167 std::cerr << "no. of repeat units : " << vntr.exact_no_ru << "\n";
168 std::cerr << "\n";
169 }
170 }
171 else if (mode==FUZZY)
172 {
173 if (debug)
174 {
175 std::cerr << "********************************************\n";
176 std::cerr << "DETECTING REPEAT TRACT FUZZILY\n";
177 }
178
179 ///////////////////////
180 //fuzzy right alignment
181 ///////////////////////
182 if (debug)
183 {
184 std::cerr << "++++++++++++++++++++++++++++++++++++++++++++\n";
185 std::cerr << "Fuzzy right alignment\n";
186 }
187
188 int32_t slen = 100;
189
190 std::string rflank;
191 std::string lflank;
192
193 int32_t lflank_end1;
194 int32_t rflank_beg1;
195
196 std::string seq;
197 int32_t seq_len;
198
199 while (true)
200 {
201 //fetch sequences for modeling
202 rs->fetch_seq(variant.chrom, vntr.exact_end1+1, vntr.exact_end1+5, rflank);
203 rs->fetch_seq(variant.chrom, vntr.exact_end1-slen, vntr.exact_end1, seq);
204 vntr.fuzzy_ru = choose_fuzzy_3prime_repeat_unit(seq, vntr.fuzzy_motif);
205 seq.append(rflank);
206
207 rfhmm->set_model(vntr.fuzzy_ru.c_str(), rflank.c_str());
208 rfhmm->align(seq.c_str(), qual.c_str());
209 if (debug) rfhmm->print_alignment();
210
211 //////////////////////
212 //fuzzy left alignment
213 //////////////////////
214 if (debug)
215 {
216 std::cerr << "\n";
217 std::cerr << "++++++++++++++++++++++++++++++++++++++++++++\n";
218 std::cerr << "Fuzzy left alignment\n";
219 }
220
221 //this is a hack around rfhmm rigidity in modeling the RUs
222 //todo: we should change this to a reverse version of LFHMM!!!!
223 if (rfhmm->get_lflank_read_epos1()>std::min((int32_t)(10*vntr.fuzzy_ru.size()), 50))
224 {
225 lflank_end1 = vntr.exact_end1-slen-1+1 + rfhmm->get_lflank_read_epos1() - 1;
226 break;
227 }
228 else if (slen==1000)
229 {
230 lflank_end1 = vntr.exact_end1 - 1000 - 1;
231 vntr.is_large_repeat_tract = true;
232 break;
233 }
234 else
235 {
236 slen +=100;
237
238 if (debug)
239 std::cerr << "extending the reference sequence for RFHMM : " << slen << "\n";
240 }
241 }
242
243 // slen = 100;
244
245 //pick 5 bases to right
246 while(true)
247 {
248 //fetch sequences for modeling
249 rs->fetch_seq(variant.chrom, lflank_end1-5+1, lflank_end1, lflank);
250 rs->fetch_seq(variant.chrom, lflank_end1+1, lflank_end1+slen, seq);
251 vntr.fuzzy_ru = choose_fuzzy_5prime_repeat_unit(seq, vntr.fuzzy_motif);
252 seq = lflank + seq;
253
254 lfhmm->set_model(lflank.c_str(), vntr.fuzzy_ru.c_str());
255 lfhmm->align(seq.c_str(), qual.c_str());
256 if (debug) lfhmm->print_alignment();
257
258 // if (lfhmm->get_rflank_read_epos1()!=INT32_MAX ||
259 if (lfhmm->get_rflank_read_spos1()<slen-std::min((int32_t)(10*vntr.fuzzy_ru.size()), 50))
260 {
261 rflank_beg1 = lflank_end1 - 5 + lfhmm->get_rflank_read_spos1();
262 break;
263 }
264 else if (slen==1000)
265 {
266 rflank_beg1 = lflank_end1 + 1000;
267 vntr.is_large_repeat_tract = true;
268 break;
269 }
270 else
271 {
272 slen +=100;
273 if (debug)
274 std::cerr << "extending the reference sequence for LFHMM : " << slen << "\n";
275 }
276 }
277
278 vntr.fuzzy_beg1 = lflank_end1+1;
279 vntr.fuzzy_end1 = rflank_beg1-1;
280 rs->fetch_seq(variant.chrom, vntr.fuzzy_beg1, vntr.fuzzy_end1, vntr.fuzzy_repeat_tract);
281 }
282 };
283
284 /**
285 * Shifts a string.
286 */
shift_str(std::string & seq,uint32_t i)287 std::string FlankDetector::shift_str(std::string& seq, uint32_t i)
288 {
289 std::string sseq = seq;
290 if (i)
291 {
292 sseq = seq.substr(i) + seq.substr(0,i);
293 }
294
295 return sseq;
296 }
297
298 /**
299 * Score a string based on complete alignment.
300 * This is for the purpose of computing a motif score.
301 * It might make sense to incorporate alignment in a limited sense.
302 */
compute_score(int32_t start,int32_t len,std::string & a,std::string & b)303 int32_t FlankDetector::compute_score(int32_t start, int32_t len, std::string& a, std::string& b)
304 {
305 // std::cerr << "a: " << a << "\n";
306 // std::cerr << "b: " << b << "\n";
307 // std::cerr << "start: " << start << "\n";
308 // std::cerr << "len: " << len << "\n";
309
310 if (len>b.size())
311 {
312 len = b.size();
313 }
314
315 int32_t score = 0;
316 int32_t i = start;
317 int32_t j = 0;
318 while (j<len)
319 {
320 if (a.at(i)==b.at(j))
321 {
322 score += 2;
323 }
324 else
325 {
326 score -=7;
327 }
328
329 ++i;
330 ++j;
331 }
332
333 return score;
334 }
335
336 /**
337 * Chooses a phase of the motif that is appropriate for the alignment from the 5 prime end.
338 * This differs from choose_exact_repeat_unit() where the motif is returned
339 * if not suitable repeat unit is found.
340 */
choose_5prime_repeat_unit(std::string & seq,std::string & motif)341 std::string FlankDetector::choose_5prime_repeat_unit(std::string& seq, std::string& motif)
342 {
343 // std::cerr << "seq : " << seq << "\n";
344 // std::cerr << "motif : " << motif << "\n";
345
346 int32_t best_score = -10000;
347 std::string best_motif = motif;
348 int32_t max_score = 2*motif.size();
349
350 int32_t mlen = motif.size();
351 for (uint32_t i=0; i<motif.size(); ++i)
352 {
353 std::string smotif = shift_str(motif, i);
354
355 if (seq.compare(0, mlen, smotif)==0)
356 {
357 return smotif;
358 }
359
360 if (seq.compare(0, mlen, smotif)==0)
361 {
362 return smotif;
363 }
364
365 std::string rc_smotif = VNTR::reverse_complement(smotif);
366 if (seq.compare(0, mlen, rc_smotif)==0)
367 {
368 return rc_smotif;
369 }
370 }
371
372 //if no exact match, perform best fit.
373 //use a priority queue here
374
375 return motif;
376 }
377
378 /**
379 * Chooses a phase of the motif that is appropriate for the alignment from the 5 prime end.
380 * If no exact match is available, the best possible match is returned.
381 */
choose_fuzzy_5prime_repeat_unit(std::string & seq,std::string & motif)382 std::string FlankDetector::choose_fuzzy_5prime_repeat_unit(std::string& seq, std::string& motif)
383 {
384 int32_t best_score = -10000;
385 std::string best_motif = motif;
386 int32_t max_score = 2*motif.size();
387
388 int32_t mlen = motif.size();
389 for (uint32_t i=0; i<motif.size(); ++i)
390 {
391 std::string smotif = shift_str(motif, i);
392 int32_t sscore = compute_score(0, mlen, seq, smotif);
393 if (sscore==max_score)
394 {
395 return smotif;
396 }
397 else if (sscore>best_score)
398 {
399 best_score = sscore;
400 best_motif = smotif;
401 }
402
403 std::string rc_smotif = VNTR::reverse_complement(smotif);
404 int32_t rc_score = compute_score(0, mlen, seq, rc_smotif);
405 if (rc_score==max_score)
406 {
407 return rc_smotif;
408 }
409 else if (rc_score>best_score)
410 {
411 best_score = rc_score;
412 best_motif = rc_smotif;
413 }
414 }
415
416 return best_motif;
417 }
418
419 /**
420 * Chooses a phase of the motif that is appropriate for the alignment from the 3 prime end.
421 * This differs from choose_exact_repeat_unit() where the motif is returned
422 * if not suitable repeat unit is found.
423 */
choose_3prime_repeat_unit(std::string & seq,std::string & motif)424 std::string FlankDetector::choose_3prime_repeat_unit(std::string& seq, std::string& motif)
425 {
426 int32_t mlen = motif.size();
427 for (uint32_t i=0; i<motif.size(); ++i)
428 {
429 std::string smotif = shift_str(motif, i);
430 if (seq.compare(seq.size()-mlen, mlen, smotif)==0)
431 {
432 return smotif;
433 }
434
435 std::string rc_smotif = VNTR::reverse_complement(smotif);
436 if (seq.compare(seq.size()-mlen, mlen, rc_smotif)==0)
437 {
438 return rc_smotif;
439 }
440 }
441
442 return motif;
443 }
444
445 /**
446 * Chooses a phase of the motif that is appropriate for the alignment from the 3 prime end.
447 * If no exact match is available, the best possible match is returned.
448 */
choose_fuzzy_3prime_repeat_unit(std::string & seq,std::string & motif)449 std::string FlankDetector::choose_fuzzy_3prime_repeat_unit(std::string& seq, std::string& motif)
450 {
451 int32_t best_score = -10000;
452 std::string best_motif = motif;
453 int32_t max_score = 2*motif.size();
454
455 int32_t mlen = motif.size();
456 for (uint32_t i=0; i<motif.size(); ++i)
457 {
458 std::string smotif = shift_str(motif, i);
459 int32_t sscore = compute_score(seq.size()-mlen, mlen, seq, smotif);
460 if (sscore==max_score)
461 {
462 return smotif;
463 }
464 else if (sscore>best_score)
465 {
466 best_score = sscore;
467 best_motif = smotif;
468 }
469
470 std::string rc_smotif = VNTR::reverse_complement(smotif);
471 int32_t rc_score = compute_score(seq.size()-mlen, mlen, seq, rc_smotif);
472 if (rc_score==max_score)
473 {
474 return rc_smotif;
475 }
476 else if (rc_score>best_score)
477 {
478 best_score = rc_score;
479 best_motif = rc_smotif;
480 }
481 }
482
483 return best_motif;
484 }
485
486 /**
487 * Chooses a phase of the motif that is appropriate for the alignment.
488 * This returns the empty string if the motif does not have an exact
489 * match in all its phases.
490 */
choose_exact_repeat_unit(std::string & seq,std::string & motif)491 std::string FlankDetector::choose_exact_repeat_unit(std::string& seq, std::string& motif)
492 {
493 // if (debug)
494 // {
495 // std::cerr << "choose_repeat_unit\n";
496 // std::cerr << "\tseq " << seq << "\n";
497 // std::cerr << "\tmotif " << motif << "\n";
498 // }
499
500 for (uint32_t i=0; i<motif.size(); ++i)
501 {
502 std::string smotif = shift_str(motif, i);
503
504 if (seq.compare(0, smotif.size(), smotif)==0)
505 {
506 return smotif;
507 }
508 }
509
510 return "";
511 }
512
513 /**
514 * Polish repeat tract ends.
515 */
polish_repeat_tract_ends(Variant & variant)516 void FlankDetector::polish_repeat_tract_ends(Variant& variant)
517 {
518 }
519
520 /**
521 * Polish repeat tract ends.
522 *
523 *
524 */
polish_repeat_tract_ends(std::string & repeat_tract,std::string & motif,bool debug)525 void FlankDetector::polish_repeat_tract_ends(std::string& repeat_tract, std::string& motif, bool debug)
526 {
527 if (debug)
528 {
529 std::cerr << "===================\n";
530 std::cerr << "Polish repeat tract\n";
531 std::cerr << "===================\n";
532 std::cerr << "repeat tract : " << repeat_tract << " (" << repeat_tract.size() << ")\n";
533 std::cerr << "motif : " << motif << "\n";
534 }
535
536 min_beg0 = repeat_tract.size();
537 max_end0 = -1;
538 int32_t mlen = motif.size();
539 int32_t rlen = repeat_tract.size();
540
541 //todo: we can use a FSA for substring matching.
542 //is there a way check all phases simultaneously?
543 for (uint32_t i = 0; i<mlen; ++i)
544 {
545 std::string smotif = shift_str(motif, i);
546
547 int32_t temp_min_beg0 = 0;
548 int32_t temp_max_end0 = 0;
549
550 for (int32_t j=0; j<rlen; ++j)
551 {
552 if (repeat_tract.compare(j, mlen, smotif)==0)
553 {
554 temp_min_beg0 = j;
555 min_beg0 = std::min(j, min_beg0);
556 break;
557 }
558 }
559
560 for (int32_t j=rlen-mlen; j>=0; --j)
561 {
562 if (repeat_tract.compare(j, mlen, smotif)==0)
563 {
564 temp_max_end0 = j+mlen-1;
565 max_end0 = std::max(j+mlen-1, max_end0);
566 break;
567 }
568 }
569
570 if (debug) std::cerr << "\t" << smotif << " " << temp_min_beg0 << " " << temp_max_end0 << "\n";
571 }
572
573 polished_repeat_tract = repeat_tract.substr(min_beg0, max_end0-min_beg0+1);
574
575 if (debug)
576 {
577 std::cerr << "min beg : " << min_beg0 << "\n";
578 std::cerr << "max end : " << max_end0 << "\n";
579 std::cerr << "polished : " << polished_repeat_tract << "\n";
580 }
581 }
582
583 /**
584 * Computes purity score of a sequence with respect to a motif.
585 *
586 * updates
587 * 1. ru
588 * 2. score
589 * 3. trf_score
590 * 4. no_exact_ru
591 * 5. total_no_ru
592 * 6. ref
593 * 7. rl
594 * 8. ll
595 */
compute_purity_score(Variant & variant,int32_t amode)596 void FlankDetector::compute_purity_score(Variant& variant, int32_t amode)
597 {
598 VNTR& vntr = variant.vntr;
599
600 if (amode&FINAL)
601 {
602 compute_purity_score(vntr.repeat_tract, vntr.ru);
603
604 vntr.ru = ru;
605 vntr.score = score;
606 vntr.trf_score = trf_score;
607 vntr.no_perfect_ru = no_perfect_ru;
608 vntr.no_ru = no_ru;
609 vntr.ref = ref;
610 vntr.rl = rl;
611 vntr.ll = rl + variant.max_dlen;
612 }
613
614 if (amode&EXACT)
615 {
616 compute_purity_score(vntr.exact_repeat_tract, vntr.exact_ru);
617
618 vntr.exact_ru = ru;
619 vntr.exact_score = score;
620 vntr.exact_trf_score = trf_score;
621 vntr.exact_no_perfect_ru = no_perfect_ru;
622 vntr.exact_no_ru = no_ru;
623 vntr.exact_ref = ref;
624 vntr.exact_rl = rl;
625 vntr.exact_ll = rl + variant.max_dlen;
626 }
627
628 if (amode&FUZZY)
629 {
630 compute_purity_score(vntr.fuzzy_repeat_tract, vntr.fuzzy_ru);
631
632 vntr.fuzzy_ru = ru;
633 vntr.fuzzy_score = score;
634 vntr.fuzzy_trf_score = trf_score;
635 vntr.fuzzy_no_perfect_ru = no_perfect_ru;
636 vntr.fuzzy_no_ru = no_ru;
637 vntr.fuzzy_ref = ref;
638 vntr.fuzzy_rl = rl;
639 vntr.fuzzy_ll = rl + variant.max_dlen;
640 }
641 }
642
643 /**
644 * Computes purity score of a sequence with respect to a motif.
645 */
compute_purity_score(std::string & repeat_tract,std::string & motif)646 void FlankDetector::compute_purity_score(std::string& repeat_tract, std::string& motif)
647 {
648 ru = choose_exact_repeat_unit(repeat_tract, motif);
649
650 if (debug)
651 {
652 std::cerr << "ru " << ru << "\n";
653 std::cerr << "repeat tract " << repeat_tract << "\n";
654
655 }
656
657 ///////////////////
658 //exact calculation
659 ///////////////////
660 if (ru!="")
661 {
662 uint32_t mlen = ru.size();
663 uint32_t j=0;
664 bool exact = true;
665 for (uint32_t i=0; i<repeat_tract.size(); ++i)
666 {
667 if (ru.at(j)!=repeat_tract.at(i))
668 {
669 exact = false;
670 break;
671 }
672
673 j = (j==mlen-1) ? 0 : j+1;
674 }
675
676 if (exact)
677 {
678 score = 1;
679 no_perfect_ru = repeat_tract.size()/motif.size();
680 no_ru = no_perfect_ru;
681 ref = (float) repeat_tract.size()/motif.size();
682 rl = repeat_tract.size();
683 trf_score = repeat_tract.size() << 1;
684 return; //done!
685 }
686 }
687
688 ///////////////////
689 //fuzzy calculation
690 ///////////////////
691 ru = motif;
692
693 if (ru.size()>ahmm->max_len)
694 {
695 //compute by chunks
696 //todo:: not the best way. update with a localized aligner
697
698 }
699 else
700 {
701 ahmm->set_model(ru.c_str());
702 ahmm->align(repeat_tract.c_str(), qual.c_str());
703
704 score = ahmm->motif_concordance;
705 score = std::round(100*score)/100;
706 no_perfect_ru = ahmm->exact_motif_count;
707 no_ru = ahmm->motif_count;
708 ref = ahmm->frac_no_repeats;
709 rl = repeat_tract.size();
710 trf_score = ahmm->trf_score;
711 }
712 }
713
714 /**
715 * Computes composition and entropy ofrepeat tract.
716 *
717 * updates
718 * 1. comp
719 * 2. entropy
720 * 3. entropy2
721 * 4. kl_divergence
722 * 5. kl_divergence2
723 */
compute_composition_and_entropy(Variant & variant,int32_t amode)724 void FlankDetector::compute_composition_and_entropy(Variant& variant, int32_t amode)
725 {
726 VNTR& vntr = variant.vntr;
727
728 if (amode&FINAL)
729 {
730 compute_composition_and_entropy(vntr.repeat_tract);
731 vntr.comp[0] = comp[0];
732 vntr.comp[1] = comp[1];
733 vntr.comp[2] = comp[2];
734 vntr.comp[3] = comp[3];
735
736 vntr.entropy = entropy;
737 vntr.entropy2 = entropy2;
738 vntr.kl_divergence = kl_divergence;
739 vntr.kl_divergence2 = kl_divergence2;
740 }
741 else if (amode&EXACT)
742 {
743 compute_composition_and_entropy(vntr.exact_repeat_tract);
744 vntr.exact_comp[0] = comp[0];
745 vntr.exact_comp[1] = comp[1];
746 vntr.exact_comp[2] = comp[2];
747 vntr.exact_comp[3] = comp[3];
748
749 vntr.exact_entropy = entropy;
750 vntr.exact_entropy2 = entropy2;
751 vntr.exact_kl_divergence = kl_divergence;
752 vntr.exact_kl_divergence2 = kl_divergence2;
753 }
754 else if (amode&FUZZY)
755 {
756 compute_composition_and_entropy(vntr.fuzzy_repeat_tract);
757 vntr.fuzzy_comp[0] = comp[0];
758 vntr.fuzzy_comp[1] = comp[1];
759 vntr.fuzzy_comp[2] = comp[2];
760 vntr.fuzzy_comp[3] = comp[3];
761 vntr.fuzzy_entropy = entropy;
762 vntr.fuzzy_entropy2 = entropy2;
763 vntr.fuzzy_kl_divergence = kl_divergence;
764 vntr.fuzzy_kl_divergence2 = kl_divergence2;
765 }
766 }
767
768 /**
769 * Computes composition and entropy ofrepeat tract.
770 */
compute_composition_and_entropy(std::string & repeat_tract)771 void FlankDetector::compute_composition_and_entropy(std::string& repeat_tract)
772 {
773 int32_t aux_comp[4] = {0,0,0,0};
774 int32_t aux_comp2[16] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
775 int32_t b2i[10] = {0,1,0,2,0,0,0,0,0,3};
776
777 // ACGT x ACGT
778 // AA - 0 = 0*0 + 0
779 // AC - 1
780 // AG - 2
781 // AT - 3
782 // CA - 4 = 1*4 + 0 = 1<<2
783 // CC - 5
784 // CG - 6
785 // CT - 7
786 // GA - 8 = 2*4 + 0 = 2<<1 = 10=>100
787 // GC - 9
788 // GG - 10
789 // GT - 11
790 // TA - 12 = 3*4 + 0 = 3<<1 11=>110=>1100
791 // TC - 13
792 // TG - 14
793 // TT - 15 = 3*4+3 = 15
794
795 int32_t n = repeat_tract.size();
796
797 for (uint32_t i=0; i<n; ++i)
798 {
799 uint32_t b1 = b2i[(repeat_tract.at(i)-65)>>1];
800
801 ++aux_comp[b1];
802 if (i<n-1)
803 {
804 uint32_t b2 = b2i[(repeat_tract.at(i+1)-65)>>1];
805 uint32_t bb = (b1<<2) + b2;
806 ++aux_comp2[bb];
807 }
808 }
809
810 float p[4] = {(float) aux_comp[0]/n, (float) aux_comp[1]/n, (float) aux_comp[2]/n, (float) aux_comp[3]/n};
811
812 entropy = 0;
813 if (p[0]) entropy += p[0] * std::log2(p[0]);
814 if (p[1]) entropy += p[1] * std::log2(p[1]);
815 if (p[2]) entropy += p[2] * std::log2(p[2]);
816 if (p[3]) entropy += p[3] * std::log2(p[3]);
817 kl_divergence = p[0] + p[1] + p[2] + p[3];
818 kl_divergence = entropy + 2*kl_divergence;
819 kl_divergence = std::round(100*kl_divergence)/100;
820 kl_divergence = fix_neg_zero(kl_divergence);
821 entropy = -entropy;
822 entropy = std::round(100*entropy)/100;
823 entropy = fix_neg_zero(entropy);
824
825 comp[0] = std::round(p[0] * 100);
826 comp[1] = std::round(p[1] * 100);
827 comp[2] = std::round(p[2] * 100);
828 comp[3] = std::round(p[3] * 100);
829
830 // std::cerr << "tract: " << repeat_tract << "\n";
831 // std::cerr << "A: " << comp[0] << " " << aux_comp[0] << "\n";
832 // std::cerr << "C: " << comp[1] << " " << aux_comp[1] << "\n";
833 // std::cerr << "G: " << comp[2] << " " << aux_comp[2] << "\n";
834 // std::cerr << "T: " << comp[3] << " " << aux_comp[3] << "\n";
835 // std::cerr << "\n";
836 // std::cerr << "entropy : " << entropy << "\n";
837 // std::cerr << "kl_divergence : " << kl_divergence << "\n";
838
839 entropy2 = 0;
840 kl_divergence2 = 0;
841 float log2q = -4;
842 if (n!=1)
843 {
844 float p2[16];
845 for (uint32_t i=0; i<16; ++i)
846 {
847 p2[i] = (float)aux_comp2[i]/(n-1);
848 }
849
850 for (uint32_t i=0; i<16; ++i)
851 {
852 if (p2[i])
853 {
854 entropy2 += p2[i]* std::log2(p2[i]);
855 kl_divergence2 += p2[i];
856 }
857 }
858 kl_divergence2 = entropy2 + 4*kl_divergence2;
859 kl_divergence2 = std::round(100*kl_divergence2)/100;
860 kl_divergence2 = fix_neg_zero(kl_divergence2);
861 entropy2 = -entropy2;
862 entropy2 = std::round(100*entropy2)/100;
863 entropy2 = fix_neg_zero(entropy2);
864
865 // std::cerr << "tract: " << repeat_tract << "\n";
866 // std::cerr << "AA: " << aux_comp2[0] << " " << p2[0] << "\n";
867 // std::cerr << "AC: " << aux_comp2[1] << " " << p2[1] << "\n";
868 // std::cerr << "AG: " << aux_comp2[2] << " " << p2[2] << "\n";
869 // std::cerr << "AT: " << aux_comp2[3] << " " << p2[3] << "\n";
870 // std::cerr << "CA: " << aux_comp2[4] << " " << p2[4] << "\n";
871 // std::cerr << "CC: " << aux_comp2[5] << " " << p2[5] << "\n";
872 // std::cerr << "CG: " << aux_comp2[6] << " " << p2[6] << "\n";
873 // std::cerr << "CT: " << aux_comp2[7] << " " << p2[7] << "\n";
874 // std::cerr << "GA: " << aux_comp2[8] << " " << p2[8] << "\n";
875 // std::cerr << "GC: " << aux_comp2[9] << " " << p2[9] << "\n";
876 // std::cerr << "GG: " << aux_comp2[10] << " " << p2[10] << "\n";
877 // std::cerr << "GT: " << aux_comp2[11] << " " << p2[11] << "\n";
878 // std::cerr << "TA: " << aux_comp2[12] << " " << p2[12] << "\n";
879 // std::cerr << "TC: " << aux_comp2[13] << " " << p2[13] << "\n";
880 // std::cerr << "TG: " << aux_comp2[14] << " " << p2[14] << "\n";
881 // std::cerr << "TT: " << aux_comp2[15] << " " << p2[15] << "\n";
882 // std::cerr << "\n";
883 // std::cerr << "entropy2 : " << entropy2 << "\n";
884 // std::cerr << "kl_divergence2 : " << kl_divergence2 << "\n";
885 }
886
887 }