1 /* The MIT License
2 
3    Copyright (c) 2015 Adrian Tan <atks@umich.edu>
4 
5    Permission is hereby granted, free of charge, to any person obtaining a copy
6    of this software and associated documentation files (the "Software"), to deal
7    in the Software without restriction, including without limitation the rights
8    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9    copies of the Software, and to permit persons to whom the Software is
10    furnished to do so, subject to the following conditions:
11 
12    The above copyright notice and this permission notice shall be included in
13    all copies or substantial portions of the Software.
14 
15    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21    THE SOFTWARE.
22 */
23 
24 #include "flank_detector.h"
25 
26 /**
27  * Constructor.
28  */
FlankDetector(std::string & ref_fasta_file,bool debug)29 FlankDetector::FlankDetector(std::string& ref_fasta_file, bool debug)
30 {
31     //////////////////////
32     //initialize variables
33     //////////////////////
34     this->debug = debug;
35 
36     ///////////////////
37     //initialize raHMMs
38     ///////////////////
39 //    float delta = 0.0001;
40 //    float epsilon = 0.0005;
41 //    float tau = 0.01;
42 //    float eta = 0.01;
43 //    float mismatch_penalty = 3;
44     float delta = 0.0000001;
45     float epsilon = 0.0000001;
46     float tau = 0.01;
47     float eta = 0.01;
48     float mismatch_penalty = 5;
49 
50     ahmm = new AHMM(false);
51     ahmm->set_delta(delta);
52     ahmm->set_epsilon(epsilon);
53     ahmm->set_tau(tau);
54     ahmm->set_eta(eta);
55     ahmm->set_mismatch_penalty(mismatch_penalty);
56     ahmm->initialize_T();
57 
58     lfhmm = new LFHMM(false);
59     lfhmm->set_delta(delta);
60     lfhmm->set_epsilon(epsilon);
61     lfhmm->set_tau(tau);
62     lfhmm->set_eta(eta);
63     lfhmm->set_mismatch_penalty(mismatch_penalty);
64     lfhmm->initialize_T();
65 
66     rfhmm = new RFHMM(false);
67     rfhmm->set_delta(delta);
68     rfhmm->set_epsilon(epsilon);
69     rfhmm->set_tau(tau);
70     rfhmm->set_eta(eta);
71     rfhmm->set_mismatch_penalty(mismatch_penalty);
72     rfhmm->initialize_T();
73 
74     qual.assign(1024, 'K');
75 
76     //////////////////
77     //initialize tools
78     //////////////////
79     rs = new ReferenceSequence(ref_fasta_file);
80 };
81 
82 /**
83  * Destructor.
84  */
~FlankDetector()85 FlankDetector::~FlankDetector()
86 {
87     delete rs;
88 }
89 
90 /**
91  * Detect repeat region.
92  *
93  * updates
94  * beg1
95  * end1
96  * repeat_tract
97  */
detect_flanks(Variant & variant,uint32_t mode)98 void FlankDetector::detect_flanks(Variant& variant, uint32_t mode)
99 {
100     VNTR& vntr = variant.vntr;
101 
102     //simple single base pair clipping of ends
103     if (mode==EXACT)
104     {
105         if (debug)
106         {
107             std::cerr << "********************************************\n";
108             std::cerr << "EXACT\n";
109             std:: cerr << "\n";
110         }
111 
112         if (debug)
113         {
114             std::cerr << "++++++++++++++++++++++++++++++++++++++++++++\n";
115             std::cerr << "Detect and remove anchor base\n";
116         }
117 
118         if (vntr.exact_repeat_tract.size()>2)
119         {
120             //removing the anchor bases
121             if (vntr.mlen==1)
122             {
123                 int32_t offset = 0;
124                 int32_t length = vntr.exact_repeat_tract.size();
125                 if (vntr.exact_repeat_tract.at(0)!=vntr.motif.at(0))
126                 {
127                     offset = 1;
128                     ++vntr.exact_beg1;
129                 }
130 
131                 if (vntr.exact_repeat_tract.at(vntr.exact_repeat_tract.size()-1)!=vntr.motif.at(0))
132                 {
133                     length -= offset+1;
134                     --vntr.exact_end1;
135                 }
136 
137                 vntr.exact_repeat_tract = vntr.exact_repeat_tract.substr(offset, length);
138             }
139             else
140             {
141                 //this works only for simple indels
142                 if (vntr.exact_repeat_tract.size()>=3)
143                 {
144                     vntr.exact_repeat_tract = vntr.exact_repeat_tract.substr(1, vntr.exact_repeat_tract.size()-2);
145                     ++vntr.exact_beg1;
146                     --vntr.exact_end1;
147                 }
148             }
149         }
150         //this is for nonexistent repeat units
151         // RU : T
152         // repeat_tract : G[T]C where T is an insert
153         else if (vntr.exact_repeat_tract.size()==2)
154         {
155 
156         }
157 
158         if (debug)
159         {
160             std::cerr << "\n";
161             std::cerr << "repeat_tract              : " << vntr.exact_repeat_tract << "\n";
162             std::cerr << "position                  : [" << vntr.exact_beg1 << "," << vntr.exact_end1 << "]\n";
163             std::cerr << "score                     : " << vntr.exact_score << "\n";
164             std::cerr << "trf score                 : " << vntr.exact_trf_score << "\n";
165             std::cerr << "repeat units              : " << vntr.exact_rl << "\n";
166             std::cerr << "perfect repeat units      : " << vntr.exact_no_perfect_ru << "\n";
167             std::cerr << "no. of repeat units       : " << vntr.exact_no_ru << "\n";
168             std::cerr << "\n";
169         }
170     }
171     else if (mode==FUZZY)
172     {
173         if (debug)
174         {
175             std::cerr << "********************************************\n";
176             std::cerr << "DETECTING REPEAT TRACT FUZZILY\n";
177         }
178 
179         ///////////////////////
180         //fuzzy right alignment
181         ///////////////////////
182         if (debug)
183         {
184             std::cerr << "++++++++++++++++++++++++++++++++++++++++++++\n";
185             std::cerr << "Fuzzy right alignment\n";
186         }
187 
188         int32_t slen = 100;
189 
190         std::string rflank;
191         std::string lflank;
192 
193         int32_t lflank_end1;
194         int32_t rflank_beg1;
195 
196         std::string seq;
197         int32_t seq_len;
198 
199         while (true)
200         {
201             //fetch sequences for modeling
202             rs->fetch_seq(variant.chrom, vntr.exact_end1+1, vntr.exact_end1+5, rflank);
203             rs->fetch_seq(variant.chrom, vntr.exact_end1-slen, vntr.exact_end1, seq);
204             vntr.fuzzy_ru = choose_fuzzy_3prime_repeat_unit(seq, vntr.fuzzy_motif);
205             seq.append(rflank);
206 
207             rfhmm->set_model(vntr.fuzzy_ru.c_str(), rflank.c_str());
208             rfhmm->align(seq.c_str(), qual.c_str());
209             if (debug) rfhmm->print_alignment();
210 
211             //////////////////////
212             //fuzzy left alignment
213             //////////////////////
214             if (debug)
215             {
216                 std::cerr << "\n";
217                 std::cerr << "++++++++++++++++++++++++++++++++++++++++++++\n";
218                 std::cerr << "Fuzzy left alignment\n";
219             }
220 
221             //this is a hack around rfhmm rigidity in modeling the RUs
222             //todo: we should change this to a reverse version of LFHMM!!!!
223             if (rfhmm->get_lflank_read_epos1()>std::min((int32_t)(10*vntr.fuzzy_ru.size()), 50))
224             {
225                 lflank_end1 = vntr.exact_end1-slen-1+1 + rfhmm->get_lflank_read_epos1() - 1;
226                 break;
227             }
228             else if (slen==1000)
229             {
230                 lflank_end1 = vntr.exact_end1 - 1000 - 1;
231                 vntr.is_large_repeat_tract = true;
232                 break;
233             }
234             else
235             {
236                 slen +=100;
237 
238                 if (debug)
239                     std::cerr << "extending the reference sequence for RFHMM : " << slen << "\n";
240             }
241         }
242 
243 //        slen = 100;
244 
245         //pick 5 bases to right
246         while(true)
247         {
248             //fetch sequences for modeling
249             rs->fetch_seq(variant.chrom, lflank_end1-5+1, lflank_end1, lflank);
250             rs->fetch_seq(variant.chrom, lflank_end1+1, lflank_end1+slen, seq);
251             vntr.fuzzy_ru = choose_fuzzy_5prime_repeat_unit(seq, vntr.fuzzy_motif);
252             seq =  lflank + seq;
253 
254             lfhmm->set_model(lflank.c_str(), vntr.fuzzy_ru.c_str());
255             lfhmm->align(seq.c_str(), qual.c_str());
256             if (debug) lfhmm->print_alignment();
257 
258 //            if (lfhmm->get_rflank_read_epos1()!=INT32_MAX ||
259             if (lfhmm->get_rflank_read_spos1()<slen-std::min((int32_t)(10*vntr.fuzzy_ru.size()), 50))
260             {
261                 rflank_beg1 = lflank_end1 - 5 + lfhmm->get_rflank_read_spos1();
262                 break;
263             }
264             else if (slen==1000)
265             {
266                 rflank_beg1 = lflank_end1 + 1000;
267                 vntr.is_large_repeat_tract = true;
268                 break;
269             }
270             else
271             {
272                 slen +=100;
273                 if (debug)
274                     std::cerr << "extending the reference sequence for LFHMM : " << slen << "\n";
275             }
276         }
277 
278         vntr.fuzzy_beg1 = lflank_end1+1;
279         vntr.fuzzy_end1 = rflank_beg1-1;
280         rs->fetch_seq(variant.chrom, vntr.fuzzy_beg1, vntr.fuzzy_end1, vntr.fuzzy_repeat_tract);
281     }
282 };
283 
284 /**
285  * Shifts a string.
286  */
shift_str(std::string & seq,uint32_t i)287 std::string FlankDetector::shift_str(std::string& seq, uint32_t i)
288 {
289     std::string sseq = seq;
290     if (i)
291     {
292         sseq = seq.substr(i) + seq.substr(0,i);
293     }
294 
295     return sseq;
296 }
297 
298 /**
299  * Score a string based on complete alignment.
300  * This is for the purpose of computing a motif score.
301  * It might make sense to incorporate alignment in a limited sense.
302  */
compute_score(int32_t start,int32_t len,std::string & a,std::string & b)303 int32_t FlankDetector::compute_score(int32_t start, int32_t len, std::string& a, std::string& b)
304 {
305 //    std::cerr << "a: " << a << "\n";
306 //    std::cerr << "b: " << b << "\n";
307 //    std::cerr << "start: " << start << "\n";
308 //    std::cerr << "len: " << len << "\n";
309 
310     if (len>b.size())
311     {
312         len = b.size();
313     }
314 
315     int32_t score = 0;
316     int32_t i = start;
317     int32_t j = 0;
318     while (j<len)
319     {
320         if (a.at(i)==b.at(j))
321         {
322             score += 2;
323         }
324         else
325         {
326             score -=7;
327         }
328 
329         ++i;
330         ++j;
331     }
332 
333     return score;
334 }
335 
336 /**
337  * Chooses a phase of the motif that is appropriate for the alignment from the 5 prime end.
338  * This differs from choose_exact_repeat_unit() where the motif is returned
339  * if not suitable repeat unit is found.
340  */
choose_5prime_repeat_unit(std::string & seq,std::string & motif)341 std::string FlankDetector::choose_5prime_repeat_unit(std::string& seq, std::string& motif)
342 {
343 //    std::cerr << "seq   : " << seq << "\n";
344 //    std::cerr << "motif : " << motif << "\n";
345 
346     int32_t best_score = -10000;
347     std::string best_motif = motif;
348     int32_t max_score = 2*motif.size();
349 
350     int32_t mlen = motif.size();
351     for (uint32_t i=0; i<motif.size(); ++i)
352     {
353         std::string smotif = shift_str(motif, i);
354 
355         if (seq.compare(0, mlen, smotif)==0)
356         {
357             return smotif;
358         }
359 
360         if (seq.compare(0, mlen, smotif)==0)
361         {
362             return smotif;
363         }
364 
365         std::string rc_smotif = VNTR::reverse_complement(smotif);
366         if (seq.compare(0, mlen, rc_smotif)==0)
367         {
368             return rc_smotif;
369         }
370     }
371 
372     //if no exact match, perform best fit.
373     //use a priority queue here
374 
375     return motif;
376 }
377 
378 /**
379  * Chooses a phase of the motif that is appropriate for the alignment from the 5 prime end.
380  * If no exact match is available, the best possible match is returned.
381  */
choose_fuzzy_5prime_repeat_unit(std::string & seq,std::string & motif)382 std::string FlankDetector::choose_fuzzy_5prime_repeat_unit(std::string& seq, std::string& motif)
383 {
384     int32_t best_score = -10000;
385     std::string best_motif = motif;
386     int32_t max_score = 2*motif.size();
387 
388     int32_t mlen = motif.size();
389     for (uint32_t i=0; i<motif.size(); ++i)
390     {
391         std::string smotif = shift_str(motif, i);
392         int32_t sscore = compute_score(0, mlen, seq, smotif);
393         if (sscore==max_score)
394         {
395             return smotif;
396         }
397         else if (sscore>best_score)
398         {
399             best_score = sscore;
400             best_motif = smotif;
401         }
402 
403         std::string rc_smotif = VNTR::reverse_complement(smotif);
404         int32_t rc_score = compute_score(0, mlen, seq, rc_smotif);
405         if (rc_score==max_score)
406         {
407             return rc_smotif;
408         }
409         else if (rc_score>best_score)
410         {
411             best_score = rc_score;
412             best_motif = rc_smotif;
413         }
414     }
415 
416     return best_motif;
417 }
418 
419 /**
420  * Chooses a phase of the motif that is appropriate for the alignment from the 3 prime end.
421  * This differs from choose_exact_repeat_unit() where the motif is returned
422  * if not suitable repeat unit is found.
423  */
choose_3prime_repeat_unit(std::string & seq,std::string & motif)424 std::string FlankDetector::choose_3prime_repeat_unit(std::string& seq, std::string& motif)
425 {
426     int32_t mlen = motif.size();
427     for (uint32_t i=0; i<motif.size(); ++i)
428     {
429         std::string smotif = shift_str(motif, i);
430         if (seq.compare(seq.size()-mlen, mlen, smotif)==0)
431         {
432             return smotif;
433         }
434 
435         std::string rc_smotif = VNTR::reverse_complement(smotif);
436         if (seq.compare(seq.size()-mlen, mlen, rc_smotif)==0)
437         {
438             return rc_smotif;
439         }
440     }
441 
442     return motif;
443 }
444 
445 /**
446  * Chooses a phase of the motif that is appropriate for the alignment from the 3 prime end.
447  * If no exact match is available, the best possible match is returned.
448  */
choose_fuzzy_3prime_repeat_unit(std::string & seq,std::string & motif)449 std::string FlankDetector::choose_fuzzy_3prime_repeat_unit(std::string& seq, std::string& motif)
450 {
451     int32_t best_score = -10000;
452     std::string best_motif = motif;
453     int32_t max_score = 2*motif.size();
454 
455     int32_t mlen = motif.size();
456     for (uint32_t i=0; i<motif.size(); ++i)
457     {
458         std::string smotif = shift_str(motif, i);
459         int32_t sscore = compute_score(seq.size()-mlen, mlen, seq, smotif);
460         if (sscore==max_score)
461         {
462             return smotif;
463         }
464         else if (sscore>best_score)
465         {
466             best_score = sscore;
467             best_motif = smotif;
468         }
469 
470         std::string rc_smotif = VNTR::reverse_complement(smotif);
471         int32_t rc_score = compute_score(seq.size()-mlen, mlen, seq, rc_smotif);
472         if (rc_score==max_score)
473         {
474             return rc_smotif;
475         }
476         else if (rc_score>best_score)
477         {
478             best_score = rc_score;
479             best_motif = rc_smotif;
480         }
481     }
482 
483     return best_motif;
484 }
485 
486 /**
487  * Chooses a phase of the motif that is appropriate for the alignment.
488  * This returns the empty string if the motif does not have an exact
489  * match in all its phases.
490  */
choose_exact_repeat_unit(std::string & seq,std::string & motif)491 std::string FlankDetector::choose_exact_repeat_unit(std::string& seq, std::string& motif)
492 {
493 //    if (debug)
494 //    {
495 //        std::cerr << "choose_repeat_unit\n";
496 //        std::cerr << "\tseq    " << seq << "\n";
497 //        std::cerr << "\tmotif  " << motif << "\n";
498 //    }
499 
500     for (uint32_t i=0; i<motif.size(); ++i)
501     {
502         std::string smotif = shift_str(motif, i);
503 
504         if (seq.compare(0, smotif.size(), smotif)==0)
505         {
506             return smotif;
507         }
508     }
509 
510     return "";
511 }
512 
513 /**
514  * Polish repeat tract ends.
515  */
polish_repeat_tract_ends(Variant & variant)516 void FlankDetector::polish_repeat_tract_ends(Variant& variant)
517 {
518 }
519 
520 /**
521  * Polish repeat tract ends.
522  *
523  *
524  */
polish_repeat_tract_ends(std::string & repeat_tract,std::string & motif,bool debug)525 void FlankDetector::polish_repeat_tract_ends(std::string& repeat_tract, std::string& motif, bool debug)
526 {
527     if (debug)
528     {
529         std::cerr << "===================\n";
530         std::cerr << "Polish repeat tract\n";
531         std::cerr << "===================\n";
532         std::cerr << "repeat tract : " << repeat_tract << " (" << repeat_tract.size() << ")\n";
533         std::cerr << "motif        : " << motif  << "\n";
534     }
535 
536     min_beg0 = repeat_tract.size();
537     max_end0 = -1;
538     int32_t mlen = motif.size();
539     int32_t rlen = repeat_tract.size();
540 
541     //todo:  we can use a FSA for substring matching.
542     //is there a way check all phases simultaneously?
543     for (uint32_t i = 0; i<mlen; ++i)
544     {
545         std::string smotif = shift_str(motif, i);
546 
547         int32_t temp_min_beg0 = 0;
548         int32_t temp_max_end0 = 0;
549 
550         for (int32_t j=0; j<rlen; ++j)
551         {
552             if (repeat_tract.compare(j, mlen, smotif)==0)
553             {
554                 temp_min_beg0 = j;
555                 min_beg0 = std::min(j, min_beg0);
556                 break;
557             }
558         }
559 
560         for (int32_t j=rlen-mlen; j>=0; --j)
561         {
562             if (repeat_tract.compare(j, mlen, smotif)==0)
563             {
564                 temp_max_end0 = j+mlen-1;
565                 max_end0 = std::max(j+mlen-1, max_end0);
566                 break;
567             }
568         }
569 
570         if (debug) std::cerr << "\t" << smotif  << " " << temp_min_beg0 << " " << temp_max_end0 << "\n";
571     }
572 
573     polished_repeat_tract = repeat_tract.substr(min_beg0, max_end0-min_beg0+1);
574 
575     if (debug)
576     {
577         std::cerr << "min beg      : " << min_beg0 << "\n";
578         std::cerr << "max end      : " << max_end0 << "\n";
579         std::cerr << "polished     : " << polished_repeat_tract << "\n";
580     }
581 }
582 
583 /**
584  * Computes purity score of a sequence with respect to a motif.
585  *
586  * updates
587  * 1. ru
588  * 2. score
589  * 3. trf_score
590  * 4. no_exact_ru
591  * 5. total_no_ru
592  * 6. ref
593  * 7. rl
594  * 8. ll
595  */
compute_purity_score(Variant & variant,int32_t amode)596 void FlankDetector::compute_purity_score(Variant& variant, int32_t amode)
597 {
598     VNTR& vntr = variant.vntr;
599 
600     if (amode&FINAL)
601     {
602         compute_purity_score(vntr.repeat_tract, vntr.ru);
603 
604         vntr.ru = ru;
605         vntr.score = score;
606         vntr.trf_score = trf_score;
607         vntr.no_perfect_ru = no_perfect_ru;
608         vntr.no_ru = no_ru;
609         vntr.ref = ref;
610         vntr.rl = rl;
611         vntr.ll = rl + variant.max_dlen;
612     }
613 
614     if (amode&EXACT)
615     {
616         compute_purity_score(vntr.exact_repeat_tract, vntr.exact_ru);
617 
618         vntr.exact_ru = ru;
619         vntr.exact_score = score;
620         vntr.exact_trf_score = trf_score;
621         vntr.exact_no_perfect_ru = no_perfect_ru;
622         vntr.exact_no_ru = no_ru;
623         vntr.exact_ref = ref;
624         vntr.exact_rl = rl;
625         vntr.exact_ll = rl + variant.max_dlen;
626     }
627 
628     if (amode&FUZZY)
629     {
630         compute_purity_score(vntr.fuzzy_repeat_tract, vntr.fuzzy_ru);
631 
632         vntr.fuzzy_ru = ru;
633         vntr.fuzzy_score = score;
634         vntr.fuzzy_trf_score = trf_score;
635         vntr.fuzzy_no_perfect_ru = no_perfect_ru;
636         vntr.fuzzy_no_ru = no_ru;
637         vntr.fuzzy_ref = ref;
638         vntr.fuzzy_rl = rl;
639         vntr.fuzzy_ll = rl + variant.max_dlen;
640     }
641 }
642 
643 /**
644  * Computes purity score of a sequence with respect to a motif.
645  */
compute_purity_score(std::string & repeat_tract,std::string & motif)646 void FlankDetector::compute_purity_score(std::string& repeat_tract, std::string& motif)
647 {
648     ru = choose_exact_repeat_unit(repeat_tract, motif);
649 
650     if (debug)
651     {
652         std::cerr << "ru           " << ru << "\n";
653         std::cerr << "repeat tract " << repeat_tract << "\n";
654 
655     }
656 
657     ///////////////////
658     //exact calculation
659     ///////////////////
660     if (ru!="")
661     {
662         uint32_t mlen = ru.size();
663         uint32_t j=0;
664         bool exact = true;
665         for (uint32_t i=0; i<repeat_tract.size(); ++i)
666         {
667             if (ru.at(j)!=repeat_tract.at(i))
668             {
669                 exact = false;
670                 break;
671             }
672 
673             j = (j==mlen-1) ? 0 : j+1;
674         }
675 
676         if (exact)
677         {
678             score = 1;
679             no_perfect_ru = repeat_tract.size()/motif.size();
680             no_ru = no_perfect_ru;
681             ref = (float) repeat_tract.size()/motif.size();
682             rl = repeat_tract.size();
683             trf_score = repeat_tract.size() << 1;
684             return; //done!
685         }
686     }
687 
688     ///////////////////
689     //fuzzy calculation
690     ///////////////////
691     ru = motif;
692 
693     if (ru.size()>ahmm->max_len)
694     {
695         //compute by chunks
696         //todo:: not the best way. update with a localized aligner
697 
698     }
699     else
700     {
701         ahmm->set_model(ru.c_str());
702         ahmm->align(repeat_tract.c_str(), qual.c_str());
703 
704         score = ahmm->motif_concordance;
705         score = std::round(100*score)/100;
706         no_perfect_ru = ahmm->exact_motif_count;
707         no_ru = ahmm->motif_count;
708         ref = ahmm->frac_no_repeats;
709         rl = repeat_tract.size();
710         trf_score = ahmm->trf_score;
711     }
712 }
713 
714 /**
715  * Computes composition and entropy ofrepeat tract.
716  *
717  * updates
718  * 1. comp
719  * 2. entropy
720  * 3. entropy2
721  * 4. kl_divergence
722  * 5. kl_divergence2
723  */
compute_composition_and_entropy(Variant & variant,int32_t amode)724 void FlankDetector::compute_composition_and_entropy(Variant& variant, int32_t amode)
725 {
726     VNTR& vntr = variant.vntr;
727 
728     if (amode&FINAL)
729     {
730         compute_composition_and_entropy(vntr.repeat_tract);
731         vntr.comp[0] = comp[0];
732         vntr.comp[1] = comp[1];
733         vntr.comp[2] = comp[2];
734         vntr.comp[3] = comp[3];
735 
736         vntr.entropy = entropy;
737         vntr.entropy2 = entropy2;
738         vntr.kl_divergence = kl_divergence;
739         vntr.kl_divergence2 = kl_divergence2;
740     }
741     else if (amode&EXACT)
742     {
743         compute_composition_and_entropy(vntr.exact_repeat_tract);
744         vntr.exact_comp[0] = comp[0];
745         vntr.exact_comp[1] = comp[1];
746         vntr.exact_comp[2] = comp[2];
747         vntr.exact_comp[3] = comp[3];
748 
749         vntr.exact_entropy = entropy;
750         vntr.exact_entropy2 = entropy2;
751         vntr.exact_kl_divergence = kl_divergence;
752         vntr.exact_kl_divergence2 = kl_divergence2;
753     }
754     else if (amode&FUZZY)
755     {
756         compute_composition_and_entropy(vntr.fuzzy_repeat_tract);
757         vntr.fuzzy_comp[0] = comp[0];
758         vntr.fuzzy_comp[1] = comp[1];
759         vntr.fuzzy_comp[2] = comp[2];
760         vntr.fuzzy_comp[3] = comp[3];
761         vntr.fuzzy_entropy = entropy;
762         vntr.fuzzy_entropy2 = entropy2;
763         vntr.fuzzy_kl_divergence = kl_divergence;
764         vntr.fuzzy_kl_divergence2 = kl_divergence2;
765     }
766 }
767 
768 /**
769  * Computes composition and entropy ofrepeat tract.
770  */
compute_composition_and_entropy(std::string & repeat_tract)771 void FlankDetector::compute_composition_and_entropy(std::string& repeat_tract)
772 {
773     int32_t aux_comp[4] = {0,0,0,0};
774     int32_t aux_comp2[16] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
775     int32_t b2i[10] = {0,1,0,2,0,0,0,0,0,3};
776 
777 //    ACGT x ACGT
778 //    AA - 0    = 0*0 + 0
779 //    AC - 1
780 //    AG - 2
781 //    AT - 3
782 //    CA - 4    = 1*4 + 0 = 1<<2
783 //    CC - 5
784 //    CG - 6
785 //    CT - 7
786 //    GA - 8    = 2*4 + 0  = 2<<1 = 10=>100
787 //    GC - 9
788 //    GG - 10
789 //    GT - 11
790 //    TA - 12   = 3*4 + 0  = 3<<1  11=>110=>1100
791 //    TC - 13
792 //    TG - 14
793 //    TT - 15    = 3*4+3 = 15
794 
795     int32_t n = repeat_tract.size();
796 
797     for (uint32_t i=0; i<n; ++i)
798     {
799         uint32_t b1 = b2i[(repeat_tract.at(i)-65)>>1];
800 
801         ++aux_comp[b1];
802         if (i<n-1)
803         {
804             uint32_t b2 = b2i[(repeat_tract.at(i+1)-65)>>1];
805             uint32_t bb = (b1<<2) + b2;
806             ++aux_comp2[bb];
807         }
808     }
809 
810     float p[4] = {(float) aux_comp[0]/n, (float) aux_comp[1]/n, (float) aux_comp[2]/n, (float) aux_comp[3]/n};
811 
812     entropy = 0;
813     if (p[0]) entropy += p[0] * std::log2(p[0]);
814     if (p[1]) entropy += p[1] * std::log2(p[1]);
815     if (p[2]) entropy += p[2] * std::log2(p[2]);
816     if (p[3]) entropy += p[3] * std::log2(p[3]);
817     kl_divergence = p[0] + p[1] + p[2] + p[3];
818     kl_divergence = entropy + 2*kl_divergence;
819     kl_divergence = std::round(100*kl_divergence)/100;
820     kl_divergence = fix_neg_zero(kl_divergence);
821     entropy = -entropy;
822     entropy =  std::round(100*entropy)/100;
823     entropy =  fix_neg_zero(entropy);
824 
825     comp[0] = std::round(p[0] * 100);
826     comp[1] = std::round(p[1] * 100);
827     comp[2] = std::round(p[2] * 100);
828     comp[3] = std::round(p[3] * 100);
829 
830 //    std::cerr << "tract: " << repeat_tract << "\n";
831 //    std::cerr << "A: " << comp[0] << " " << aux_comp[0] << "\n";
832 //    std::cerr << "C: " << comp[1] << " " << aux_comp[1] << "\n";
833 //    std::cerr << "G: " << comp[2] << " " << aux_comp[2] << "\n";
834 //    std::cerr << "T: " << comp[3] << " " << aux_comp[3] << "\n";
835 //    std::cerr << "\n";
836 //    std::cerr << "entropy       : " << entropy << "\n";
837 //    std::cerr << "kl_divergence : " << kl_divergence << "\n";
838 
839     entropy2 = 0;
840     kl_divergence2 = 0;
841     float log2q = -4;
842     if (n!=1)
843     {
844         float p2[16];
845         for (uint32_t i=0; i<16; ++i)
846         {
847             p2[i] = (float)aux_comp2[i]/(n-1);
848         }
849 
850         for (uint32_t i=0; i<16; ++i)
851         {
852             if (p2[i])
853             {
854                 entropy2 += p2[i]* std::log2(p2[i]);
855                 kl_divergence2 += p2[i];
856             }
857         }
858         kl_divergence2 = entropy2 + 4*kl_divergence2;
859         kl_divergence2 = std::round(100*kl_divergence2)/100;
860         kl_divergence2 = fix_neg_zero(kl_divergence2);
861         entropy2 = -entropy2;
862         entropy2 = std::round(100*entropy2)/100;
863         entropy2 = fix_neg_zero(entropy2);
864 
865 //        std::cerr << "tract: " << repeat_tract << "\n";
866 //        std::cerr << "AA: " << aux_comp2[0] << " " << p2[0] << "\n";
867 //        std::cerr << "AC: " << aux_comp2[1] << " " << p2[1] << "\n";
868 //        std::cerr << "AG: " << aux_comp2[2] << " " << p2[2] << "\n";
869 //        std::cerr << "AT: " << aux_comp2[3] << " " << p2[3]  << "\n";
870 //        std::cerr << "CA: " << aux_comp2[4] << " " << p2[4] << "\n";
871 //        std::cerr << "CC: " << aux_comp2[5] << " " << p2[5] << "\n";
872 //        std::cerr << "CG: " << aux_comp2[6] << " " << p2[6] << "\n";
873 //        std::cerr << "CT: " << aux_comp2[7] << " " << p2[7] << "\n";
874 //        std::cerr << "GA: " << aux_comp2[8] << " " << p2[8] << "\n";
875 //        std::cerr << "GC: " << aux_comp2[9] << " " << p2[9] << "\n";
876 //        std::cerr << "GG: " << aux_comp2[10] << " " << p2[10] << "\n";
877 //        std::cerr << "GT: " << aux_comp2[11] << " " << p2[11] << "\n";
878 //        std::cerr << "TA: " << aux_comp2[12] << " " << p2[12] << "\n";
879 //        std::cerr << "TC: " << aux_comp2[13] << " " << p2[13] << "\n";
880 //        std::cerr << "TG: " << aux_comp2[14] << " " << p2[14] << "\n";
881 //        std::cerr << "TT: " << aux_comp2[15] << " " << p2[15] << "\n";
882 //        std::cerr << "\n";
883 //        std::cerr << "entropy2       : " << entropy2 << "\n";
884 //        std::cerr << "kl_divergence2 : " << kl_divergence2 << "\n";
885     }
886 
887 }