1 
2 /*
3  * Copyright 2018, Chanhee Park <parkchanhee@gmail.com> and Daehwan Kim <infphilo@gmail.com>
4  *
5  * This file is part of HISAT 2.
6  *
7  * HISAT 2 is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version.
11  *
12  * HISAT 2 is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with HISAT 2.  If not, see <http://www.gnu.org/licenses/>.
19  */
20 
21 #include <iostream>
22 #include <vector>
23 #include <algorithm>
24 #include <cmath>
25 #include "timer.h"
26 #include "aligner_sw.h"
27 #include "aligner_result.h"
28 #include "scoring.h"
29 #include "sstring.h"
30 
31 #include "repeat_builder.h"
32 #include "repeat_kmer.h"
33 #include "bit_packed_array.h"
34 
levenshtein_distance(const std::string & s1,const std::string & s2)35 unsigned int levenshtein_distance(const std::string& s1, const std::string& s2)
36 {
37     const std::size_t len1 = s1.size(), len2 = s2.size();
38     std::vector<unsigned int> col(len2+1), prevCol(len2+1);
39 
40     for (unsigned int i = 0; i < prevCol.size(); i++)
41         prevCol[i] = i;
42     for (unsigned int i = 0; i < len1; i++) {
43         col[0] = i+1;
44         for (unsigned int j = 0; j < len2; j++)
45             // note that std::min({arg1, arg2, arg3}) works only in C++11,
46             // for C++98 use std::min(std::min(arg1, arg2), arg3)
47             col[j+1] = std::min( std::min(prevCol[1 + j] + 1, col[j] + 1), prevCol[j] + (s1[i]==s2[j] ? 0 : 1) );
48         col.swap(prevCol);
49     }
50     return prevCol[len2];
51 }
52 
53 
hamming_distance(const string & s1,const string & s2)54 unsigned int hamming_distance(const string& s1, const string& s2)
55 {
56     assert_eq(s1.length(), s2.length());
57 
58     uint32_t cnt = 0;
59 
60     for(size_t i = 0; i < s1.length(); i++) {
61         if(s1[i] != s2[i]) {
62             cnt++;
63         }
64     }
65 
66     return cnt;
67 }
68 
reverse(const string & str)69 string reverse(const string& str)
70 {
71     string rev = str;
72     size_t str_len = str.length();
73 
74     for(size_t i = 0; i < str_len; i++) {
75         rev[i] = str[str_len - i - 1];
76     }
77 
78     return rev;
79 }
80 
reverse_complement(const string & str)81 string reverse_complement(const string& str)
82 {
83     string rev = str;
84     size_t str_len = str.length();
85 
86     for(size_t i = 0; i < str_len; i++) {
87         char nt = str[str_len - i - 1];
88         rev[i] = asc2dnacomp[nt];
89     }
90 
91     return rev;
92 }
93 
94 template<typename TStr>
getLCP(const TStr & s,CoordHelper & coordHelper,TIndexOffU a,TIndexOffU b,TIndexOffU max_len=1000)95 TIndexOffU getLCP(const TStr& s,
96                   CoordHelper& coordHelper,
97                   TIndexOffU a,
98                   TIndexOffU b,
99                   TIndexOffU max_len = 1000)
100 {
101     TIndexOffU a_end = coordHelper.getEnd(a);
102     TIndexOffU b_end = coordHelper.getEnd(b);
103 
104     assert_leq(a_end, s.length());
105     assert_leq(b_end, s.length());
106 
107     TIndexOffU k = 0;
108     while((a + k) < a_end && (b + k) < b_end && k < max_len) {
109         if(s[a + k] != s[b + k]) {
110             break;
111         }
112         k++;
113     }
114 
115     return k;
116 }
117 
118 template<typename TStr>
isSameSequenceUpto(const TStr & s,CoordHelper & coordHelper,TIndexOffU a,TIndexOffU b,TIndexOffU upto)119 bool isSameSequenceUpto(const TStr& s,
120                               CoordHelper& coordHelper,
121                               TIndexOffU a,
122                               TIndexOffU b,
123                               TIndexOffU upto)
124 {
125     TIndexOffU a_end = coordHelper.getEnd(a);
126     TIndexOffU b_end = coordHelper.getEnd(b);
127     assert_leq(a_end, s.length());
128     assert_leq(b_end, s.length());
129 
130     if(a + upto > a_end || b + upto > b_end)
131         return false;
132 
133     for(int k = upto - 1; k >= 0; k--) {
134         if(s[a + k] != s[b + k]) {
135             return false;
136         }
137     }
138 
139     return true;
140 }
141 
isSenseDominant(CoordHelper & coordHelper,const EList<TIndexOffU> & positions,size_t seed_len)142 bool isSenseDominant(CoordHelper& coordHelper,
143                      const EList<TIndexOffU>& positions,
144                      size_t seed_len)
145 {
146     // count the number of seeds on the sense strand
147     size_t sense_mer_count = 0;
148     for(size_t i = 0; i < positions.size(); i++) {
149         if(positions[i] < coordHelper.forward_length())
150             sense_mer_count++;
151     }
152 
153     // there exists two sets of seeds that are essentially the same
154     // due to sense and antisense strands except palindromic sequences
155     // choose one and skip the other one
156     assert_leq(sense_mer_count, positions.size());
157     size_t antisense_mer_count = positions.size() - sense_mer_count;
158     if(sense_mer_count < antisense_mer_count) {
159         return false;
160     } else if(sense_mer_count == antisense_mer_count) {
161         assert_geq(positions.back(), coordHelper.forward_length());
162         TIndexOffU sense_pos = coordHelper.length() - positions.back() - seed_len;
163         if(positions[0] > sense_pos) return false;
164     }
165 
166     return true;
167 }
168 
169 static const Range EMPTY_RANGE = Range(1, 0);
170 
171 struct RepeatRange {
RepeatRangeRepeatRange172 	RepeatRange() {
173         forward = true;
174     };
RepeatRangeRepeatRange175 	RepeatRange(Range r, int id) :
176 		range(r), rg_id(id), forward(true) {};
RepeatRangeRepeatRange177     RepeatRange(Range r, int id, bool fw) :
178         range(r), rg_id(id), forward(fw) {};
179 
180 	Range range;
181 	int rg_id;
182     bool forward;
183 };
184 
reverseRange(const Range & r,TIndexOffU size)185 Range reverseRange(const Range& r, TIndexOffU size)
186 {
187 	size_t len = r.second - r.first;
188 	Range rc;
189 
190 	rc.first = size - r.second;
191 	rc.second = rc.first + len;
192 
193 	return rc;
194 }
195 
reverseComplement(const string & str)196 string reverseComplement(const string& str)
197 {
198 	string rc;
199 	for(TIndexOffU si = str.length(); si > 0; si--) {
200 		rc.push_back(asc2dnacomp[str[si - 1]]);
201 	}
202 	return rc;
203 }
204 
205 
toMDZ(const EList<Edit> & edits,const string & read)206 string toMDZ(const EList<Edit>& edits, const string& read)
207 {
208     StackedAln stk;
209     BTDnaString btread;
210     BTString buf;
211 
212     btread.install(read.c_str(), true);
213 
214     stk.init(btread, edits, 0, 0, 0, 0);
215     stk.buildMdz();
216     stk.writeMdz(&buf, NULL);
217 
218     return string(buf.toZBuf());
219 }
220 
applyEdits(const string & ref,size_t read_len,const EList<Edit> & edits,const Coord & coord)221 string applyEdits(const string& ref, size_t read_len, const EList<Edit>& edits, const Coord& coord)
222 {
223     string read;
224     size_t ref_pos = coord.off();
225     size_t read_pos = 0;
226 
227     for(size_t i = 0; i < edits.size(); i++) {
228         for(; read_pos < edits[i].pos; read_pos++, ref_pos++) {
229             read.push_back(ref[ref_pos]);
230         }
231 
232         if(edits[i].type == EDIT_TYPE_READ_GAP) {
233             // delete on read
234             ref_pos++;
235         } else if(edits[i].type == EDIT_TYPE_REF_GAP) {
236             // insert on read
237             read.push_back(edits[i].qchr);
238             read_pos++;
239         } else if(edits[i].type == EDIT_TYPE_MM) {
240             assert_eq(ref[ref_pos], edits[i].chr);
241             read.push_back(edits[i].qchr);
242 
243             read_pos++;
244             ref_pos++;
245         } else {
246             assert(false);
247         }
248     }
249 
250     for(; read_pos < read_len; read_pos++, ref_pos++) {
251         read.push_back(ref[ref_pos]);
252     }
253 
254     return read;
255 }
256 
257 template<typename index_t>
compareRepeatCoordByJoinedOff(const RepeatCoord<index_t> & a,const RepeatCoord<index_t> & b)258 static bool compareRepeatCoordByJoinedOff(const RepeatCoord<index_t>& a, const RepeatCoord<index_t>& b)
259 {
260 	return a.joinedOff < b.joinedOff;
261 }
262 
263 
264 template<typename TStr>
getString(const TStr & ref,TIndexOffU start,size_t len)265 string getString(const TStr& ref, TIndexOffU start, size_t len)
266 {
267     ASSERT_ONLY(const size_t ref_len = ref.length());
268     assert_leq(start + len, ref_len);
269 
270     string s;
271 	for(size_t i = 0; i < len; i++) {
272 		char nt = "ACGT"[ref[start + i]];
273 		s.push_back(nt);
274 	}
275 
276 	return s;
277 }
278 
279 template<typename TStr>
getString(const TStr & ref,TIndexOffU start,size_t len,string & s)280 void getString(const TStr& ref, TIndexOffU start, size_t len, string& s)
281 {
282     s.clear();
283     ASSERT_ONLY(const size_t ref_len = ref.length());
284     assert_leq(start + len, ref_len);
285     for(size_t i = 0; i < len; i++) {
286         char nt = "ACGT"[ref[start + i]];
287         s.push_back(nt);
288     }
289 }
290 
291 
292 template<typename TStr>
getSequenceBase(const TStr & ref,TIndexOffU pos)293 inline uint8_t getSequenceBase(const TStr& ref, TIndexOffU pos)
294 {
295     assert_lt(pos, ref.length());
296     return (uint8_t)ref[pos];
297 }
298 
299 
300 template<typename TStr>
dump_tstr(const TStr & s)301 void dump_tstr(const TStr& s)
302 {
303 	static int print_width = 60;
304 
305 	size_t s_len = s.length();
306 
307 	for(size_t i = 0; i < s_len; i += print_width) {
308 		string buf;
309 		for(size_t j = 0; (j < print_width) && (i + j < s_len); j++) {
310 			buf.push_back("ACGTN"[s[i + j]]);
311 		}
312 		cerr << buf << endl;
313 	}
314 	cerr << endl;
315 }
316 
317 
getMaxMatchLen(const EList<Edit> & edits,const size_t read_len)318 size_t getMaxMatchLen(const EList<Edit>& edits, const size_t read_len)
319 {
320     size_t max_length = 0;
321     uint32_t last_edit_pos = 0;
322     uint32_t len = 0;
323 
324     if (edits.size() == 0) {
325         // no edits --> exact match
326         return read_len;
327     }
328 
329     for(size_t i = 0; i < edits.size(); i++) {
330         if (last_edit_pos > edits[i].pos) {
331             continue;
332         }
333 
334         len = edits[i].pos - last_edit_pos;
335         if(len > max_length) {
336             max_length = len;
337         }
338 
339         last_edit_pos = edits[i].pos + 1;
340     }
341 
342     if (last_edit_pos < read_len) {
343         len = read_len - last_edit_pos;
344         if(len > max_length) {
345             max_length = len;
346         }
347     }
348 
349     return max_length;
350 }
351 
max_index(size_t array[4])352 int max_index(size_t array[4])
353 {
354     int max_idx = 0;
355 
356     for(size_t i = 1; i < 4; i++) {
357         if(array[max_idx] < array[i]) {
358             max_idx = i;
359         }
360     }
361 
362     return max_idx;
363 }
364 
CoordHelper(TIndexOffU length,TIndexOffU forward_length,const EList<RefRecord> & szs,const EList<string> & ref_names)365 CoordHelper::CoordHelper(TIndexOffU length,
366                          TIndexOffU forward_length,
367                          const EList<RefRecord>& szs,
368                          const EList<string>& ref_names) :
369 length_(length),
370 forward_length_(forward_length),
371 szs_(szs),
372 ref_namelines_(ref_names)
373 {
374     // build ref_names_ from ref_namelines_
375     buildNames();
376     buildJoinedFragment();
377 }
378 
~CoordHelper()379 CoordHelper::~CoordHelper()
380 {
381 }
382 
buildNames()383 void CoordHelper::buildNames()
384 {
385     ref_names_.resize(ref_namelines_.size());
386     for(size_t i = 0; i < ref_namelines_.size(); i++) {
387         string& nameline = ref_namelines_[i];
388 
389         for(size_t j = 0; j < nameline.length(); j++) {
390             char n = nameline[j];
391             if(n == ' ') {
392                 break;
393             }
394             ref_names_[i].push_back(n);
395         }
396     }
397 }
398 
mapJoinedOffToSeq(TIndexOffU joinedOff)399 int CoordHelper::mapJoinedOffToSeq(TIndexOffU joinedOff)
400 {
401     /* search from cached_list */
402     if(num_cached_ > 0) {
403         for(size_t i = 0; i < num_cached_; i++) {
404             Fragments *frag = &cached_[i];
405             if(frag->contain(joinedOff)) {
406                 return frag->frag_id;
407             }
408         }
409         /* fall through */
410     }
411 
412     /* search list */
413     int top = 0;
414     int bot = fraglist_.size() - 1;
415     int pos = 0;
416 
417     Fragments *frag = &fraglist_[pos];
418     while((bot - top) > 1) {
419         pos = top + ((bot - top) >> 1);
420         frag = &fraglist_[pos];
421 
422         if (joinedOff < frag->joinedOff) {
423             bot = pos;
424         } else {
425             top = pos;
426         }
427     }
428 
429     frag = &fraglist_[top];
430     if (frag->contain(joinedOff)) {
431         // update cache
432         if (num_cached_ < CACHE_SIZE_JOINEDFRG) {
433             cached_[num_cached_] = *frag;
434             num_cached_++;
435         } else {
436             cached_[victim_] = *frag;
437             victim_++; // round-robin
438             victim_ %= CACHE_SIZE_JOINEDFRG;
439         }
440 
441         return top;
442     }
443 
444     return -1;
445 }
446 
getGenomeCoord(TIndexOffU joinedOff,string & chr_name,TIndexOffU & pos_in_chr)447 int CoordHelper::getGenomeCoord(TIndexOffU joinedOff,
448                                   string& chr_name,
449                                   TIndexOffU& pos_in_chr)
450 {
451     int seq_id = mapJoinedOffToSeq(joinedOff);
452     if (seq_id < 0) {
453         return -1;
454     }
455 
456     Fragments *frag = &fraglist_[seq_id];
457     TIndexOffU offset = joinedOff - frag->joinedOff;
458 
459     pos_in_chr = frag->seqOff + offset;
460     chr_name = ref_names_[frag->seq_id];
461 
462     return 0;
463 }
464 
buildJoinedFragment()465 void CoordHelper::buildJoinedFragment()
466 {
467     TIndexOffU acc_joinedOff = 0;
468     TIndexOffU acc_seqOff = 0;
469     TIndexOffU seq_id = 0;
470     TIndexOffU frag_id = 0;
471     for(size_t i = 0; i < szs_.size(); i++) {
472         const RefRecord& rec = szs_[i];
473         if(rec.len == 0) {
474             continue;
475         }
476         if (rec.first) {
477             acc_seqOff = 0;
478             seq_id++;
479         }
480 
481         fraglist_.expand();
482 
483         fraglist_.back().joinedOff = acc_joinedOff;
484         fraglist_.back().length = rec.len;
485 
486         fraglist_.back().frag_id = frag_id++;
487         fraglist_.back().seq_id = seq_id - 1;
488         fraglist_.back().first = rec.first;
489 
490         acc_seqOff += rec.off;
491         fraglist_.back().seqOff = acc_seqOff;
492 
493         acc_joinedOff += rec.len;
494         acc_seqOff += rec.len;
495     }
496 
497     // Add Last Fragment(empty)
498     fraglist_.expand();
499     fraglist_.back().joinedOff = acc_joinedOff;
500     fraglist_.back().length = 0;
501     fraglist_.back().seqOff = acc_seqOff;
502     fraglist_.back().first = false;
503     fraglist_.back().frag_id = frag_id;
504     fraglist_.back().seq_id = seq_id;
505 }
506 
getEnd(TIndexOffU e)507 TIndexOffU CoordHelper::getEnd(TIndexOffU e) {
508     assert_lt(e, length_)
509 
510     TIndexOffU end = 0;
511     if(e < forward_length_) {
512         int frag_id = mapJoinedOffToSeq(e);
513         assert_geq(frag_id, 0);
514         end = fraglist_[frag_id].joinedOff + fraglist_[frag_id].length;
515     } else {
516         // ReverseComplement
517         // a, b are positions w.r.t reverse complement string.
518         // fragment map is based on forward string
519         int frag_id = mapJoinedOffToSeq(length_ - e - 1);
520         assert_geq(frag_id, 0);
521         end = length_ - fraglist_[frag_id].joinedOff;
522     }
523 
524     assert_leq(end, length_);
525     return end;
526 }
527 
getStart(TIndexOffU e)528 TIndexOffU CoordHelper::getStart(TIndexOffU e) {
529     assert_lt(e, length_)
530 
531     TIndexOffU start = 0;
532     if(e < forward_length_) {
533         int frag_id = mapJoinedOffToSeq(e);
534         assert_geq(frag_id, 0);
535         start = fraglist_[frag_id].joinedOff;
536     } else {
537         // ReverseComplement
538         // a, b are positions w.r.t reverse complement string.
539         // fragment map is based on forward string
540         int frag_id = mapJoinedOffToSeq(length_ - e - 1);
541         assert_geq(frag_id, 0);
542         start = length_ - (fraglist_[frag_id].joinedOff + fraglist_[frag_id].length);
543     }
544 
545     assert_leq(start, length_);
546     return start;
547 }
548 
549 template<typename TStr>
getExtendedSeedSequence(const TStr & s,string & seq) const550 void SeedExt::getExtendedSeedSequence(const TStr& s,
551                                       string& seq) const
552 {
553     seq.clear();
554     TIndexOffU prev_end = orig_pos.first;
555     for(size_t j = 0; j < left_gaps.size(); j++) {
556         TIndexOffU curr_end = orig_pos.first - left_gaps[j].first;
557         assert_leq(curr_end, prev_end);
558         if(curr_end < prev_end) {
559             seq = getString(s, curr_end, prev_end - curr_end) + seq;
560         }
561         int gap_len = left_gaps[j].second;
562         assert_neq(gap_len, 0);
563         if(gap_len > 0) { // deletion
564             string gap_str(gap_len, '-');
565             seq = gap_str + seq;
566         } else {
567             curr_end += gap_len;
568         }
569         prev_end = curr_end;
570     }
571     assert_leq(pos.first, prev_end);
572     if(pos.first < prev_end) {
573         seq = getString(s, pos.first, prev_end - pos.first) + seq;
574     }
575     if(orig_pos.second - orig_pos.first > 0)
576         seq += getString(s, orig_pos.first, orig_pos.second - orig_pos.first);
577     TIndexOffU prev_begin = orig_pos.second;
578     for(size_t j = 0; j < right_gaps.size(); j++) {
579         TIndexOffU curr_begin = orig_pos.second + right_gaps[j].first;
580         assert_leq(prev_begin, curr_begin);
581         if(prev_begin < curr_begin) {
582             seq += getString(s, prev_begin, curr_begin - prev_begin);
583         }
584         int gap_len = right_gaps[j].second;
585         assert_neq(gap_len, 0);
586         if(gap_len > 0) { // deletion
587             string gap_str(gap_len, '-');
588             seq += gap_str;
589         } else {
590             curr_begin -= gap_len;
591         }
592         prev_begin = curr_begin;
593     }
594     assert_leq(prev_begin, pos.second);
595     if(prev_begin < pos.second) {
596         seq += getString(s, prev_begin, pos.second - prev_begin);
597     }
598 }
599 
lookup_add_SNP(EList<SeedSNP * > & repeat_snps,SeedSNP & snp)600 SeedSNP *lookup_add_SNP(EList<SeedSNP *>& repeat_snps, SeedSNP& snp)
601 {
602     for(size_t i = 0; i < repeat_snps.size(); i++) {
603         if(*repeat_snps[i] == snp) {
604             return repeat_snps[i];
605         }
606     }
607 
608     repeat_snps.expand();
609     repeat_snps.back() = new SeedSNP();
610     *(repeat_snps.back()) = snp;
611     return repeat_snps.back();
612 }
613 
614 template<typename TStr>
generateSNPs(const TStr & s,const string & consensus,EList<SeedSNP * > & repeat_snps)615 void SeedExt::generateSNPs(const TStr &s, const string& consensus, EList<SeedSNP *>& repeat_snps)
616 {
617     EList<pair<TIndexOffU, int> > gaps;
618 
619     // Merge Gaps
620     {
621         TIndexOffU left_ext_len = getLeftExtLength();
622         for(size_t g = 0; g < left_gaps.size(); g++) {
623             gaps.expand();
624             gaps.back().first = left_ext_len - left_gaps[g].first + left_gaps[g].second;
625             gaps.back().second = left_gaps[g].second;
626         }
627         TIndexOffU right_base = orig_pos.second - pos.first;
628         for(size_t g = 0; g < right_gaps.size(); g++) {
629             gaps.expand();
630             gaps.back().first = right_base + right_gaps[g].first;
631             gaps.back().second = right_gaps[g].second;
632         }
633         gaps.sort();
634     }
635 
636     //
637     string con_ref;
638     string seq_read;
639     TIndexOffU prev_con_pos = consensus_pos.first;
640     TIndexOffU prev_seq_pos = pos.first;
641 
642     for(size_t g = 0; g < gaps.size(); g++) {
643         TIndexOffU curr_seq_pos = pos.first + gaps[g].first;
644         TIndexOffU curr_con_pos = prev_con_pos + (curr_seq_pos - prev_seq_pos);
645 
646         con_ref = consensus.substr(prev_con_pos, curr_con_pos - prev_con_pos);
647         seq_read = getString(s, prev_seq_pos, curr_seq_pos - prev_seq_pos);
648         for(size_t l = 0; l < con_ref.length(); l++) {
649             if(seq_read[l] != con_ref[l]) {
650                 // Single
651                 SeedSNP snp;
652                 snp.init(EDIT_TYPE_MM, prev_con_pos + l, 1, seq_read[l]);
653 
654                 // lookup repeat_snp
655                 snps.expand();
656                 snps.back() = lookup_add_SNP(repeat_snps, snp);
657             }
658         }
659 
660         int gap_len = gaps[g].second;
661         assert_neq(gap_len, 0);
662         if(gap_len > 0) {
663             // Deletion (gap on read)
664 
665             string gap_str(gap_len, '-');
666 
667             SeedSNP snp;
668             snp.init(EDIT_TYPE_READ_GAP, curr_con_pos, gap_len, consensus.substr(curr_con_pos, gap_len));
669 
670             snps.expand();
671             snps.back() = lookup_add_SNP(repeat_snps, snp);
672 
673             curr_con_pos += gap_len;
674 
675         } else {
676             // Insertion (gap on reference)
677             string gap_str(abs(gap_len), '-');
678 
679             SeedSNP snp;
680             snp.init(EDIT_TYPE_REF_GAP, curr_con_pos, abs(gap_len), getString(s, curr_seq_pos, abs(gap_len)));
681 
682             snps.expand();
683             snps.back() = lookup_add_SNP(repeat_snps, snp);
684 
685             curr_seq_pos += abs(gap_len);
686         }
687 
688         prev_con_pos = curr_con_pos;
689         prev_seq_pos = curr_seq_pos;
690     }
691 
692     assert_eq(consensus_pos.second - prev_con_pos, pos.second - prev_seq_pos);
693     con_ref = consensus.substr(prev_con_pos, consensus_pos.second - prev_con_pos);
694     seq_read = getString(s, prev_seq_pos, pos.second - prev_seq_pos);
695 
696     for(size_t l = 0; l < con_ref.length(); l++) {
697         if(seq_read[l] != con_ref[l]) {
698             // Single
699 
700             SeedSNP snp;
701             snp.init(EDIT_TYPE_MM, prev_con_pos + l, 1, seq_read[l]);
702 
703             snps.expand();
704             snps.back() = lookup_add_SNP(repeat_snps, snp);
705         }
706     }
707 }
708 
seedCmp(const SeedExt & s,const SeedExt & s2)709 bool seedCmp(const SeedExt& s, const SeedExt& s2)
710 {
711     if(s.getLength() != s2.getLength())
712         return s.getLength() > s2.getLength();
713     if(s.pos.first != s2.pos.first)
714         return s.pos.first < s2.pos.first;
715     if(s.pos.second != s2.pos.second)
716         return s.pos.second < s2.pos.second;
717     return false;
718 }
719 
720 template<typename TStr>
init(const RepeatParameter & rp,const TStr & s,CoordHelper & coordHelper,const RB_SubSA & subSA,const RB_RepeatBase & repeatBase)721 void RB_Repeat::init(const RepeatParameter& rp,
722                      const TStr& s,
723                      CoordHelper& coordHelper,
724                      const RB_SubSA& subSA,
725                      const RB_RepeatBase& repeatBase)
726 {
727     consensus_ = repeatBase.seq;
728     assert_geq(consensus_.length(), rp.min_repeat_len);
729     assert_eq(consensus_.length(), rp.min_repeat_len + repeatBase.nodes.size() - 1);
730 
731     EList<TIndexOffU> positions, next_positions;
732 
733     const EList<TIndexOffU>& repeat_index = subSA.getRepeatIndex();
734     seeds_.clear();
735     for(size_t n = 0; n < repeatBase.nodes.size(); n++) {
736         TIndexOffU idx = repeatBase.nodes[n];
737         TIndexOffU saBegin = repeat_index[idx];
738         TIndexOffU saEnd = (idx + 1 < repeat_index.size() ? repeat_index[idx+1] : subSA.size());
739 
740         next_positions.clear();
741         for(size_t sa = saBegin; sa < saEnd; sa++) {
742             TIndexOffU left = subSA.get(sa);
743             TIndexOffU right = left + subSA.seed_len();
744             next_positions.push_back(left);
745 
746             if(left > 0) {
747                 size_t idx = positions.bsearchLoBound(left - 1);
748                 if(idx < positions.size() && positions[idx] == left - 1)
749                     continue;
750             }
751 
752             seeds_.expand();
753             SeedExt& seed = seeds_.back();
754             seed.reset();
755             seed.orig_pos = pair<TIndexOffU, TIndexOffU>(left, right);
756             seed.pos = seed.orig_pos;
757             seed.consensus_pos.first = n;
758             seed.consensus_pos.second = n + rp.min_repeat_len;
759             seed.bound = pair<TIndexOffU, TIndexOffU>(coordHelper.getStart(left), coordHelper.getEnd(left));
760 
761 #ifndef NDEBUG
762             string tmp_str = getString(s, seed.pos.first, seed.pos.second - seed.pos.first);
763             assert_eq(tmp_str, consensus_.substr(n, seed.pos.second - seed.pos.first));
764 #endif
765 
766             for(size_t p = seed.consensus_pos.second; p < consensus_.length(); p++) {
767                 size_t pos = seed.pos.second;
768                 if(pos >= seed.bound.second)
769                     break;
770                 int nt = getSequenceBase(s, pos);
771                 if("ACGT"[nt] != consensus_[p])
772                     break;
773                 seed.pos.second++;
774                 seed.consensus_pos.second++;
775             }
776         }
777         positions = next_positions;
778     }
779 
780     internal_update();
781 }
782 
783 template<typename TStr>
extendConsensus(const RepeatParameter & rp,const TStr & s)784 void RB_Repeat::extendConsensus(const RepeatParameter& rp,
785                                 const TStr& s)
786 {
787     size_t remain = seeds_.size();
788     EList<string> left_consensuses, right_consensuses, empty_consensuses;
789     EList<size_t> ed_seed_nums;
790 
791     const TIndexOffU default_max_ext_len = (rp.max_repeat_len - rp.seed_len) / 2;
792     const TIndexOffU seed_mm = 1;
793     string left_ext_consensus = "", right_ext_consensus = "";
794 
795     empty_consensuses.resize(seed_mm + 1);
796     empty_consensuses.fill("");
797 
798     while(remain >= rp.repeat_count) {
799         for(size_t i = 0; i < remain; i++) {
800             seeds_[i].done = false;
801             seeds_[i].curr_ext_len = 0;
802         }
803 
804         // extend seeds in left or right direction
805         TIndexOffU max_ext_len = min(default_max_ext_len, (TIndexOffU)(rp.max_repeat_len - consensus_.length()));
806         get_consensus_seq(s,
807                           seeds_,
808                           0,      // seed begin
809                           remain, // seed end
810                           max_ext_len,
811                           max_ext_len,
812                           seed_mm,
813                           rp,
814                           ed_seed_nums,
815                           &left_consensuses,
816                           &right_consensuses);
817 
818         size_t allowed_seed_mm = 0;
819         left_ext_consensus.clear();
820         right_ext_consensus.clear();
821         for(int i = 0 /* (int)seed_mm */; i >= 0; i--) {
822             size_t extlen = (ed_seed_nums[i] < rp.repeat_count ? 0 : (left_consensuses[i].length() + right_consensuses[i].length()));
823             // if(extlen <= 0 || extlen < max_ext_len * i / seed_mm)
824             //if(extlen < max_ext_len * 2)
825             //    continue;
826             if(extlen <= 0)
827                 continue;
828 
829             left_ext_consensus = left_consensuses[i];
830             right_ext_consensus = right_consensuses[i];
831             allowed_seed_mm = (size_t)i;
832             assert_gt(left_ext_consensus.length(), 0);
833             assert_gt(right_ext_consensus.length(), 0);
834             break;
835         }
836         size_t num_passed_seeds = 0;
837         if(left_ext_consensus.length() > 0 && right_ext_consensus.length()) {
838             consensus_ = reverse(left_ext_consensus) + consensus_;
839             consensus_ += right_ext_consensus;
840 
841 #if 0
842             calc_edit_dist(s,
843                            seeds_,
844                            0,
845                            remain,
846                            left ? consensuses : empty_consensuses,
847                            left ? empty_consensuses : consensuses,
848                            allowed_seed_mm);
849 #endif
850 
851             // update seeds
852             for(size_t i = 0; i < seeds_.size(); i++) {
853                 SeedExt& seed = seeds_[i];
854 
855                 if(i < remain) {
856                     if(seed.ed <= allowed_seed_mm) {
857                         num_passed_seeds++;
858                         seed.done = true;
859                         seed.total_ed += seed.ed;
860                         seed.pos.first -= left_ext_consensus.length();
861                         seed.pos.second += right_ext_consensus.length();
862                         seed.consensus_pos.first = 0;
863                         seed.consensus_pos.second = consensus_.length();
864 
865                         if(seed.left_gaps.size() > 0 &&
866                            seed.left_gaps.back().first >= seed.orig_pos.first - seed.pos.first) {
867                             int gap_len = seed.left_gaps.back().second;
868                             seed.pos.first += gap_len;
869                         }
870                         if(seed.right_gaps.size() > 0 &&
871                            seed.right_gaps.back().first >= seed.pos.second - seed.orig_pos.second) {
872                             int gap_len = seed.right_gaps.back().second;
873                             seed.pos.second -= gap_len;
874                         }
875                     }
876                 }
877             }
878 
879             // move up "done" seeds
880             size_t j = 0;
881             for(size_t i = 0; i < remain; i++) {
882                 if(!seeds_[i].done) continue;
883                 assert_geq(i, j);
884                 if(i > j) {
885                     SeedExt temp = seeds_[j];
886                     seeds_[j] = seeds_[i];
887                     seeds_[i] = temp;
888                     // Find next "undone" seed
889                     j++;
890                     while(j < i && seeds_[j].done) {
891                         j++;
892                     }
893                     assert(j < remain && !seeds_[j].done);
894                 } else {
895                     j = i + 1;
896                 }
897             }
898         }
899 
900         remain = num_passed_seeds;
901         break;
902     } // while(remain >= rp.repeat_count)
903 
904 #ifndef NDEBUG
905     // make sure seed positions are unique
906     EList<pair<TIndexOffU, TIndexOffU> > seed_poses;
907     for(size_t i = 0; i < seeds_.size(); i++) {
908         seed_poses.expand();
909         seed_poses.back() = seeds_[i].orig_pos;
910     }
911     seed_poses.sort();
912     for(size_t i = 0; i + 1 < seed_poses.size(); i++) {
913         if(seed_poses[i].first == seed_poses[i+1].first) {
914             assert_lt(seed_poses[i].second, seed_poses[i+1].second);
915         } else {
916             assert_lt(seed_poses[i].first, seed_poses[i+1].first);
917         }
918     }
919 
920     for(size_t i = 0; i < seeds_.size(); i++) {
921         assert(seeds_[i].valid());
922     }
923 #endif
924 
925     RB_Repeat::internal_update();
926 
927     // check repeats within a repeat sequence
928     self_repeat_ = false;
929     for(size_t i = 0; i + 1 < seed_ranges_.size(); i++) {
930         const RB_AlleleCoord& range = seed_ranges_[i];
931         for(size_t j = i + 1; j < seed_ranges_.size(); j++) {
932             RB_AlleleCoord& range2 = seed_ranges_[j];
933             if(range.right <= range2.left)
934                 break;
935             self_repeat_  = true;
936         }
937     }
938 }
939 
940 template<typename TStr>
getNextRepeat(const RepeatParameter & rp,const TStr & s,RB_Repeat & o)941 void RB_Repeat::getNextRepeat(const RepeatParameter& rp,
942                               const TStr& s,
943                               RB_Repeat& o)
944 {
945     o.reset();
946 
947     size_t i = 0;
948     for(i = 0; i < seeds_.size() && seeds_[i].done; i++);
949     if(i >= seeds_.size())
950         return;
951 
952     for(size_t j = i; j < seeds_.size(); j++) {
953         assert(!seeds_[j].done);
954         o.seeds_.expand();
955         o.seeds_.back() = seeds_[j];
956 
957     }
958     seeds_.resizeExact(i);
959     internal_update();
960 
961     assert_gt(o.seeds_.size(), 0);
962     o.consensus_ = getString(s, o.seeds_[0].orig_pos.first, o.seeds_[0].orig_pos.second - o.seeds_[0].orig_pos.first);
963     for(size_t j = 0; j < o.seeds_.size(); j++) {
964         o.seeds_[j].pos = o.seeds_[j].orig_pos;
965         o.seeds_[j].left_gaps.clear();
966         o.seeds_[j].right_gaps.clear();
967         o.seeds_[j].ed = o.seeds_[j].total_ed = 0;
968     }
969 }
970 
internal_update()971 void RB_Repeat::internal_update()
972 {
973     sort(seeds_.begin(), seeds_.end(), seedCmp);
974 
975     seed_ranges_.resizeExact(seeds_.size());
976     for(size_t i = 0; i < seeds_.size(); i++) {
977         seed_ranges_[i].left = seeds_[i].pos.first;
978         seed_ranges_[i].right = seeds_[i].pos.second;
979         seed_ranges_[i].idx = i;
980     }
981     seed_ranges_.sort();
982 
983     // check repeats within a repeat sequence
984     size_t remove_count = 0;
985     for(size_t i = 0; i + 1 < seed_ranges_.size(); i++) {
986         const RB_AlleleCoord& range = seed_ranges_[i];
987         if(range.left == numeric_limits<size_t>::max())
988             continue;
989         for(size_t j = i + 1; j < seed_ranges_.size(); j++) {
990             RB_AlleleCoord& range2 = seed_ranges_[j];
991             if(range2.left == numeric_limits<size_t>::max())
992                 continue;
993             if(range.right <= range2.left)
994                 break;
995             if(range.right >= range2.right) {
996                 range2.left = numeric_limits<size_t>::max();
997                 remove_count++;
998             }
999         }
1000     }
1001 
1002     if(remove_count <= 0)
1003         return;
1004 
1005     for(size_t i = 0; i < seed_ranges_.size(); i++) {
1006         if(seed_ranges_[i].left == numeric_limits<size_t>::max())
1007             seeds_[seed_ranges_[i].idx].reset();
1008     }
1009     sort(seeds_.begin(), seeds_.end(), seedCmp);
1010 
1011 #ifndef NDEBUG
1012     for(size_t i = 0; i < seeds_.size(); i++) {
1013         if(i < seeds_.size() - remove_count) {
1014             assert_lt(seeds_[i].pos.first, seeds_[i].pos.second);
1015         } else {
1016             assert_eq(seeds_[i].pos.first, seeds_[i].pos.second);
1017         }
1018     }
1019 #endif
1020 
1021     seeds_.resize(seeds_.size() - remove_count);
1022     seed_ranges_.resize(seeds_.size());
1023     for(size_t i = 0; i < seeds_.size(); i++) {
1024         seed_ranges_[i].left = seeds_[i].pos.first;
1025         seed_ranges_[i].right = seeds_[i].pos.second;
1026         seed_ranges_[i].idx = i;
1027     }
1028     seed_ranges_.sort();
1029 }
1030 
contain(TIndexOffU left,TIndexOffU right) const1031 bool RB_Repeat::contain(TIndexOffU left, TIndexOffU right) const
1032 {
1033     size_t l = 0, r = seed_ranges_.size();
1034     while(l < r) {
1035         size_t m = (l + r) / 2;
1036         const RB_AlleleCoord& coord = seed_ranges_[m];
1037         if(right <= coord.left) {
1038             r = m;
1039         } else if(left >= coord.right) {
1040             l = m + 1;
1041         } else {
1042             return coord.left <= left && right <= coord.right;
1043         }
1044     }
1045 
1046     return false;
1047 }
1048 
1049 template<typename TStr>
saveSeedExtension(const RepeatParameter & rp,const TStr & s,CoordHelper & coordHelper,TIndexOffU grp_id,ostream & fp,size_t & total_repeat_seq_len,size_t & total_allele_seq_len) const1050 void RB_Repeat::saveSeedExtension(const RepeatParameter& rp,
1051                                   const TStr& s,
1052                                   CoordHelper& coordHelper,
1053                                   TIndexOffU grp_id,
1054                                   ostream& fp,
1055                                   size_t& total_repeat_seq_len,
1056                                   size_t& total_allele_seq_len) const
1057 {
1058     // apply color, which is compatible with linux commands such as cat and less -r
1059 #if 1
1060     const string red = "\033[31m", reset = "\033[0m", resetline = "\033[4m", redline = "\033[4;31m";
1061 #else
1062     const string red = "", reset = "", resetline = "", redline = "";
1063 #endif
1064     bool show_exterior_seq = true;
1065     const size_t max_show_seq_len = 700;
1066 
1067     size_t total_count = 0;
1068     for(size_t i = 0; i < seeds_.size(); i++) {
1069         const SeedExt& seed = seeds_[i];
1070         size_t ext_len = seed.getLength();
1071         if(ext_len < rp.min_repeat_len) continue;
1072         total_allele_seq_len += ext_len;
1073         total_count++;
1074         bool sense_strand = seed.pos.first < coordHelper.forward_length();
1075 
1076         fp << setw(6) << repeat_id_ << "  " << setw(5) << seeds_.size();
1077         fp << "  " << setw(4) << i;
1078         fp << "  " << setw(4) << ext_len;
1079         fp << "  " << (sense_strand ? '+' : '-');
1080         fp << "  " << setw(10) << seed.pos.first << "  " << setw(10) << seed.pos.second;
1081         fp << "  " << setw(10) << seed.orig_pos.first << "  " << setw(10) << seed.orig_pos.second;
1082 
1083         string chr_name;
1084         TIndexOffU pos_in_chr;
1085         if(sense_strand) {
1086             coordHelper.getGenomeCoord(seed.pos.first, chr_name, pos_in_chr);
1087         } else {
1088             coordHelper.getGenomeCoord(s.length() - seed.pos.first - (seed.pos.second - seed.pos.first), chr_name, pos_in_chr);
1089         }
1090         fp << "  " << setw(5) << chr_name << ":" << setw(10) << std::left << pos_in_chr << std::right;
1091 
1092         if(sense_strand) {
1093             coordHelper.getGenomeCoord(seed.pos.second, chr_name, pos_in_chr);
1094         } else {
1095             coordHelper.getGenomeCoord(s.length() - seed.pos.second - (seed.pos.second - seed.pos.first), chr_name, pos_in_chr);
1096         }
1097         fp << "  " << setw(5) << chr_name << ":" << setw(10) << std::left << pos_in_chr << std::right;
1098 
1099         if(!seed.aligned) {
1100             fp << endl;
1101             continue;
1102         }
1103 
1104         string deststr = "";
1105         seed.getExtendedSeedSequence(s, deststr);
1106 
1107         // add exterior sequences
1108         if(seed.consensus_pos.first > 0) {
1109             TIndexOffU seq_pos, seq_len;
1110             if(seed.pos.first >= seed.consensus_pos.first) {
1111                 seq_pos = seed.pos.first - seed.consensus_pos.first;
1112                 seq_len = seed.consensus_pos.first;
1113             } else {
1114                 seq_pos = 0;
1115                 seq_len = seed.pos.first;
1116             }
1117             deststr = getString(s, seq_pos, seq_len) + deststr;
1118             if(seq_len < seed.consensus_pos.first) {
1119                 deststr = string(seed.consensus_pos.first - seq_len, 'N') + deststr;
1120             }
1121         }
1122         if(seed.consensus_pos.second < consensus_.length()) {
1123             deststr += getString(s, seed.pos.second, consensus_.length() - seed.consensus_pos.second);
1124         }
1125 
1126         assert_eq(consensus_.length(), deststr.length());
1127         fp << "  ";
1128 
1129         // print sequence w.r.t. the current group
1130         for(size_t j = 0; j < min(consensus_.length(), max_show_seq_len); j++) {
1131             bool outside = j < seed.consensus_pos.first || j >= seed.consensus_pos.second;
1132             bool different = (consensus_[j] != deststr[j]);
1133             if(outside) {
1134                 if(show_exterior_seq) {
1135                     if(different) {
1136                         fp << redline;
1137                         fp << deststr[j];
1138                         fp << reset;
1139                     } else {
1140                         fp << resetline;
1141                         fp << deststr[j];
1142                         fp << reset;
1143                     }
1144                 } else {
1145                     if(j < seed.consensus_pos.first)
1146                         fp << " ";
1147                 }
1148             } else {
1149                 if(different) fp << red;
1150                 fp << deststr[j];
1151                 if(different) fp << reset;
1152             }
1153         }
1154 
1155 #if 0
1156         fp << "\t";
1157         for(size_t ei = 0; ei < seed.edits.size(); ei++) {
1158             const Edit& edit = seed.edits[ei];
1159             if(ei > 0) fp << ",";
1160             fp << edit;
1161             if (edit.snpID != std::numeric_limits<uint32_t>::max()) {
1162                 fp << "@" << edit.snpID;
1163             }
1164         }
1165 #endif
1166 
1167         fp << endl;
1168     }
1169 
1170     if(total_count > 0) fp << setw(5) << total_count << endl << endl;
1171     total_repeat_seq_len += consensus_.length();
1172 }
1173 
1174 template<typename TStr>
calc_edit_dist(const TStr & s,const SeedExt & seed,const SeedExt & seed2,size_t left_ext,size_t right_ext,size_t max_ed)1175 size_t calc_edit_dist(const TStr& s,
1176                       const SeedExt& seed,
1177                       const SeedExt& seed2,
1178                       size_t left_ext,
1179                       size_t right_ext,
1180                       size_t max_ed)
1181 {
1182     if(seed.bound.first + left_ext > seed.pos.first ||
1183        seed.pos.second + right_ext > seed.bound.second ||
1184        seed2.bound.first + left_ext > seed2.pos.first ||
1185        seed2.pos.second + right_ext > seed2.bound.second)
1186         return max_ed + 1;
1187 
1188     size_t ed = 0;
1189     for(size_t i = 0; i < left_ext; i++) {
1190         int ch = getSequenceBase(s, seed.pos.first - i - 1);
1191         assert_range(0, 3, ch);
1192         int ch2 = getSequenceBase(s, seed2.pos.first - i - 1);
1193         assert_range(0, 3, ch2);
1194         if(ch != ch2) ed++;
1195         if(ed > max_ed)
1196             return ed;
1197 
1198     }
1199     for(size_t i = 0; i < right_ext; i++) {
1200         int ch = getSequenceBase(s, seed.pos.second + i);
1201         assert_range(0, 3, ch);
1202         int ch2 = getSequenceBase(s, seed2.pos.second + i);
1203         assert_range(0, 3, ch2);
1204         if(ch != ch2) ed++;
1205         if(ed > max_ed)
1206             return ed;
1207     }
1208 
1209     return ed;
1210 }
1211 
extract_kmer(const string & seq,size_t offset,size_t k=5)1212 size_t extract_kmer(const string& seq, size_t offset, size_t k = 5)
1213 {
1214     assert_leq(offset + k, seq.length());
1215     size_t kmer = 0;
1216     for(size_t i = 0; i < k; i++) {
1217         kmer = (kmer << 2) | asc2dna[seq[offset + i]];
1218     }
1219     return kmer;
1220 }
1221 
next_kmer(size_t kmer,char base,size_t k=5)1222 size_t next_kmer(size_t kmer, char base, size_t k = 5)
1223 {
1224     kmer &= ((1 << ((k-1)*2))) - 1;
1225     kmer = (kmer << 2) | asc2dna[base];
1226     return kmer;
1227 }
1228 
build_kmer_table(const string & consensus,EList<pair<size_t,size_t>> & kmer_table,size_t k=5)1229 void build_kmer_table(const string& consensus, EList<pair<size_t, size_t> >& kmer_table, size_t k = 5)
1230 {
1231     kmer_table.clear();
1232     if(consensus.length() < k)
1233         return;
1234     size_t kmer = 0;
1235     for(size_t i = 0; i + k <= consensus.length(); i++) {
1236         if(i == 0) {
1237             kmer = extract_kmer(consensus, i, k);
1238         } else {
1239             kmer = next_kmer(kmer, consensus[i+k-1], k);
1240         }
1241         kmer_table.expand();
1242         kmer_table.back().first = kmer;
1243         kmer_table.back().second = i;
1244     }
1245     kmer_table.sort();
1246 }
1247 
find_gap_pos(const string & s,const string & s2,EList<size_t> & ed,EList<size_t> & ed2,bool del,size_t gap_len,size_t max_mm,size_t & gap_pos,size_t & mm,bool debug=false)1248 void find_gap_pos(const string& s,
1249                   const string& s2,
1250                   EList<size_t>& ed,
1251                   EList<size_t>& ed2,
1252                   bool del,
1253                   size_t gap_len,
1254                   size_t max_mm,
1255                   size_t& gap_pos,
1256                   size_t& mm,
1257                   bool debug = false)
1258 {
1259     assert_eq(s.length(), s2.length());
1260     size_t seq_len = s.length();
1261     ed.resizeExact(seq_len); ed.fill(max_mm + 1);
1262     ed2.resizeExact(seq_len); ed2.fill(max_mm + 1);
1263 
1264     // from left to right
1265     for(size_t i = 0; i < seq_len; i++) {
1266         size_t add = (s[i] == s2[i] ? 0 : 1);
1267         if(i == 0) ed[i] = add;
1268         else       ed[i] = ed[i-1] + add;
1269         if(ed[i] >= max_mm + 1) break;
1270     }
1271 
1272     // from right to left
1273     size_t s_sub  = del ? 0 : gap_len;
1274     size_t s2_sub = del ? gap_len : 0;
1275     for(int i = seq_len - 1; i >= gap_len; i--) {
1276         size_t add = (s[i - s_sub] == s2[i - s2_sub] ? 0 : 1);
1277         if(i == seq_len - 1) ed2[i] = add;
1278         else                 ed2[i] = ed2[i+1] + add;
1279         if(ed2[i] > max_mm) break;
1280     }
1281 
1282     if(debug) {
1283         cout << s << endl << s2 << endl;
1284         for(size_t i = 0; i < ed.size(); i++) {
1285             cout << (ed[i] % 10);
1286         }
1287         cout << endl;
1288         for(size_t i = 0; i < ed2.size(); i++) {
1289             cout << (ed2[i] % 10);
1290         }
1291         cout << endl;
1292     }
1293 
1294     size_t min_mm = ed2[gap_len];
1295     int min_mm_i = -1;
1296     assert_eq(ed.size(), ed2.size());
1297     for(size_t i = 0; i + gap_len + 1 < ed.size(); i++) {
1298         if(ed[i] > max_mm) break;
1299         size_t cur_mm = ed[i] + ed2[i + gap_len + 1];
1300         if(cur_mm < min_mm) {
1301             min_mm = cur_mm;
1302             min_mm_i = i;
1303         }
1304     }
1305 
1306     mm = min_mm;
1307     if(mm > max_mm)
1308         return;
1309     gap_pos = (size_t)(min_mm_i + 1);
1310 }
1311 
align_with_one_gap(const string & s,const EList<pair<size_t,size_t>> & s_kmer_table,const string & s2,EList<size_t> & counts,size_t max_gap,size_t max_mm,size_t & mm,size_t & gap_pos,int & gap_len,size_t k=5)1312 void align_with_one_gap(const string& s,
1313                         const EList<pair<size_t, size_t> >& s_kmer_table,
1314                         const string& s2,
1315                         EList<size_t>& counts,
1316                         size_t max_gap,
1317                         size_t max_mm,
1318                         size_t& mm,
1319                         size_t& gap_pos,
1320                         int& gap_len,
1321                         size_t k = 5)
1322 {
1323     mm = max_mm + 1;
1324     assert_eq(s.length(), s2.length());
1325     counts.resizeExact(max_gap * 2 + 1);
1326     counts.fillZero();
1327     size_t max_count = 0, max_count_i = 0;
1328     for(size_t i = 0; i + k <= s2.length(); i += 1) {
1329         pair<size_t, size_t> kmer(0, 0);
1330         kmer.first = extract_kmer(s2, i, k);
1331         size_t lb = s_kmer_table.bsearchLoBound(kmer);
1332         while(lb < s_kmer_table.size() && kmer.first == s_kmer_table[lb].first) {
1333             int gap = (int)s_kmer_table[lb].second - (int)i;
1334             if(gap != 0 && abs(gap) < max_gap) {
1335                 size_t gap_i = (size_t)(gap + max_gap);
1336                 counts[gap_i]++;
1337                 if(counts[gap_i] > max_count) {
1338                     max_count = counts[gap_i];
1339                     max_count_i = gap_i;
1340                 }
1341             }
1342             lb++;
1343         }
1344     }
1345 
1346     if(max_count <= 0)
1347         return;
1348 
1349     assert_lt(max_count_i, counts.size());
1350     int gap = (int)max_count_i - (int)max_gap;
1351     assert_neq(gap, 0);
1352     size_t abs_gap = abs(gap);
1353     bool del = gap > 0;
1354 
1355     EList<size_t> ed, ed2;
1356     find_gap_pos(s,
1357                  s2,
1358                  ed,
1359                  ed2,
1360                  del,
1361                  abs_gap,
1362                  max_mm,
1363                  gap_pos,
1364                  mm);
1365     if(mm > max_mm)
1366         return;
1367 
1368     gap_len = del ? abs_gap : -abs_gap;
1369 
1370 #ifndef NDEBUG
1371     assert_leq(mm, max_mm);
1372     string ds = s, ds2 = s2;
1373     size_t debug_mm = 0;
1374     if(del) ds.erase(gap_pos, abs_gap);
1375     else    ds2.erase(gap_pos, abs_gap);
1376 
1377     for(size_t i = 0; i < min(ds.length(), ds2.length()); i++) {
1378         if(ds[i] != ds2[i])
1379             debug_mm++;
1380     }
1381     assert_eq(debug_mm, mm);
1382 #endif
1383 }
1384 
1385 template<typename TStr>
calc_edit_dist(const TStr & s,EList<SeedExt> & seeds,size_t sb,size_t se,const EList<string> & left_consensuses,const EList<string> & right_consensuses,uint32_t max_ed)1386 void calc_edit_dist(const TStr& s,
1387                     EList<SeedExt>& seeds,
1388                     size_t sb,
1389                     size_t se,
1390                     const EList<string>& left_consensuses,
1391                     const EList<string>& right_consensuses,
1392                     uint32_t max_ed)
1393 {
1394     string left_consensus = left_consensuses[max_ed];
1395     string right_consensus = right_consensuses[max_ed];
1396 
1397     EList<pair<size_t, size_t> > left_kmer_table, right_kmer_table;
1398     build_kmer_table(left_consensus, left_kmer_table);
1399     build_kmer_table(right_consensus, right_kmer_table);
1400 
1401     string left_seq, right_seq;
1402     const size_t max_gap = 10;
1403     EList<size_t> counts;
1404 
1405     size_t left_ext = left_consensus.length();
1406     size_t right_ext = right_consensus.length();
1407     for(size_t i = sb; i < se; i++) {
1408         SeedExt& seed = seeds[i];
1409         if(seed.bound.first + left_ext > seed.pos.first ||
1410            seed.pos.second + right_ext > seed.bound.second) {
1411             seed.ed = max_ed + 1;
1412             continue;
1413         }
1414 
1415         size_t left_ed = 0;
1416         if(left_ext > 0) {
1417             getString(s, seed.pos.first - left_ext, left_ext, left_seq);
1418             reverse(left_seq.begin(), left_seq.end());
1419             for(size_t j = 0; j < left_ext; j++) {
1420                 if(left_seq[j] != left_consensus[j]) left_ed++;
1421                 if(left_ed <= max_ed && j < left_consensuses[left_ed].length()) {
1422                     seed.curr_ext_len = j + 1;
1423                 }
1424             }
1425 
1426             if(left_ed > max_ed) {
1427                 size_t gap_pos = 0;
1428                 int gap_len = 0;
1429                 align_with_one_gap(left_consensus,
1430                                    left_kmer_table,
1431                                    left_seq,
1432                                    counts,
1433                                    min(left_ed - max_ed, max_gap),
1434                                    max_ed,
1435                                    left_ed,
1436                                    gap_pos,
1437                                    gap_len);
1438                 if(left_ed <= max_ed) {
1439                     seed.left_gaps.expand();
1440                     seed.left_gaps.back().first = seed.getLeftExtLength() + gap_pos;
1441                     seed.left_gaps.back().second = gap_len;
1442                 }
1443             }
1444         } else {
1445             left_seq.clear();
1446         }
1447 
1448         size_t right_ed = 0;
1449         if(right_ext > 0) {
1450             getString(s, seed.pos.second, right_ext, right_seq);
1451             for(size_t j = 0; j < right_ext; j++) {
1452                 if(right_seq[j] != right_consensus[j]) right_ed++;
1453                 if(right_ed <= max_ed && j < right_consensuses[right_ed].length()) {
1454                     seed.curr_ext_len = j + 1;
1455                 }
1456             }
1457             if(right_ed > max_ed) {
1458                 size_t gap_pos = 0;
1459                 int gap_len = 0;
1460                 align_with_one_gap(right_consensus,
1461                                    right_kmer_table,
1462                                    right_seq,
1463                                    counts,
1464                                    min(right_ed - max_ed, max_gap),
1465                                    max_ed,
1466                                    right_ed,
1467                                    gap_pos,
1468                                    gap_len);
1469                 if(right_ed <= max_ed) {
1470                     seed.right_gaps.expand();
1471                     seed.right_gaps.back().first = seed.getRightExtLength() + gap_pos;
1472                     seed.right_gaps.back().second = gap_len;
1473                 }
1474             }
1475         } else {
1476             right_seq.clear();
1477         }
1478 
1479         seed.ed = left_ed + right_ed;
1480     }
1481 }
1482 
1483 template<typename TStr>
get_consensus_seq(const TStr & s,EList<SeedExt> & seeds,size_t sb,size_t se,size_t min_left_ext,size_t min_right_ext,size_t max_ed,const RepeatParameter & rp,EList<size_t> & ed_seed_nums,EList<string> * left_consensuses,EList<string> * right_consensuses) const1484 void RB_Repeat::get_consensus_seq(const TStr& s,
1485                                   EList<SeedExt>& seeds,
1486                                   size_t sb,
1487                                   size_t se,
1488                                   size_t min_left_ext,
1489                                   size_t min_right_ext,
1490                                   size_t max_ed,
1491                                   const RepeatParameter& rp,
1492                                   EList<size_t>& ed_seed_nums,
1493                                   EList<string>* left_consensuses,
1494                                   EList<string>* right_consensuses) const
1495 {
1496     assert_lt(sb, se);
1497     assert_geq(se - sb, rp.seed_count);
1498     assert_leq(se, seeds.size());
1499     if(left_consensuses != NULL) {
1500         left_consensuses->clear(); left_consensuses->resize(max_ed + 1);
1501         for(size_t i = 0; i < max_ed + 1; i++) {
1502             (*left_consensuses)[i].clear();
1503         }
1504     }
1505     if(right_consensuses != NULL) {
1506         right_consensuses->clear(); right_consensuses->resize(max_ed + 1);
1507         for(size_t i = 0; i < max_ed + 1; i++) {
1508             (*right_consensuses)[i].clear();
1509         }
1510     }
1511 
1512     assert_eq(min_left_ext, min_right_ext);
1513 
1514     EList<string> seqs;
1515     seqs.reserveExact(seeds.size());
1516     size_t max_i = 0, max_count = 0;
1517     while(min_left_ext > 0) {
1518         seqs.clear();
1519         max_i = 0;
1520         max_count = 0;
1521         for(size_t i = sb; i < se; i++) {
1522             const SeedExt& seed = seeds[i];
1523             if(seeds[i].bound.first + min_left_ext > seed.pos.first ||
1524                seed.pos.second + min_right_ext > seed.bound.second)
1525                 continue;
1526 
1527             seqs.expand();
1528             if(min_left_ext > 0) {
1529                 getString(s, seed.pos.first - min_left_ext, min_left_ext, seqs.back());
1530             }
1531             if(min_right_ext > 0) {
1532                 seqs.back() += getString(s, seed.pos.second, min_right_ext);
1533             }
1534         }
1535         seqs.sort();
1536         for(size_t i = 0; i + max_count < seqs.size();) {
1537             size_t count = 1;
1538             for(size_t j = i + 1; j < seqs.size(); j++) {
1539                 if(seqs[i] == seqs[j]) count++;
1540                 else                   break;
1541             }
1542             if(count >= max_count) {
1543                 max_count = count;
1544                 max_i = i;
1545             }
1546             i = i + count;
1547         }
1548         if(max_count >= rp.seed_count)
1549             break;
1550 
1551         min_left_ext--;
1552         min_right_ext--;
1553     }
1554 
1555     // update ed
1556     // extend consensus string
1557     ed_seed_nums.resize(max_ed + 1); ed_seed_nums.fillZero();
1558     EList<size_t> next_ed_seed_nums; next_ed_seed_nums.resize(max_ed + 1);
1559 
1560     if(max_count < rp.seed_count)
1561         return;
1562 
1563     for(size_t i = sb; i < se; i++) seeds[i].ed = 0;
1564     size_t seed_ext_len = 0;
1565     while(seed_ext_len < max(min_left_ext, min_right_ext)) {
1566         // select base to be used for extension
1567         uint8_t left_ext_base = 0;
1568         if(seed_ext_len < min_left_ext) left_ext_base = seqs[max_i][min_left_ext - seed_ext_len - 1];
1569         uint8_t right_ext_base = 0;
1570         if(seed_ext_len < min_right_ext) right_ext_base = seqs[max_i][min_left_ext + seed_ext_len];
1571 
1572         // estimate extended ed
1573         next_ed_seed_nums.fillZero();
1574         for(size_t i = sb; i < se; i++) {
1575             uint32_t est_ed = seeds[i].ed;
1576             if(seed_ext_len < min_left_ext) {
1577                 if(seeds[i].bound.first + seed_ext_len + 1 <= seeds[i].pos.first) {
1578                     TIndexOffU left_pos = seeds[i].pos.first - seed_ext_len - 1;
1579                     int ch = getSequenceBase(s, left_pos);
1580                     assert_range(0, 3, ch);
1581                     if ("ACGT"[ch] != left_ext_base) {
1582                         est_ed++;
1583                     }
1584                 } else {
1585                     est_ed = max_ed + 1;
1586                 }
1587             }
1588 
1589             if(seed_ext_len < min_right_ext) {
1590                 TIndexOffU right_pos = seeds[i].pos.second + seed_ext_len;
1591                 if(right_pos >= seeds[i].bound.second) {
1592                     est_ed = max_ed + 1;
1593                 } else {
1594                     int ch = getSequenceBase(s, right_pos);
1595                     assert_range(0, 3, ch);
1596                     if ("ACGT"[ch] != right_ext_base) {
1597                         est_ed++;
1598                     }
1599                 }
1600             }
1601 
1602             if(est_ed <= max_ed) {
1603                 next_ed_seed_nums[est_ed]++;
1604             }
1605         }
1606 
1607         for(size_t i = 1; i < next_ed_seed_nums.size(); i++) next_ed_seed_nums[i] += next_ed_seed_nums[i-1];
1608         if(next_ed_seed_nums[max_ed] < rp.repeat_count) {
1609             break;
1610         }
1611 
1612         for(size_t i = sb; i < se; i++) {
1613             if(seed_ext_len < min_left_ext) {
1614                 if(seeds[i].bound.first + seed_ext_len + 1 <= seeds[i].pos.first) {
1615                     TIndexOffU left_pos = seeds[i].pos.first - seed_ext_len - 1;
1616                     int ch = getSequenceBase(s, left_pos);
1617                     assert_range(0, 3, ch);
1618                     if ("ACGT"[ch] != left_ext_base) {
1619                         seeds[i].ed++;
1620                     }
1621                 } else {
1622                     seeds[i].ed = max_ed + 1;
1623                 }
1624             }
1625 
1626             if(seed_ext_len < min_right_ext) {
1627                 TIndexOffU right_pos = seeds[i].pos.second + seed_ext_len;
1628                 if(right_pos >= seeds[i].bound.second) {
1629                     seeds[i].ed = max_ed + 1;
1630                 } else {
1631                     int ch = getSequenceBase(s, right_pos);
1632                     assert_range(0, 3, ch);
1633                     if ("ACGT"[ch] != right_ext_base) {
1634                         seeds[i].ed++;
1635                     }
1636                 }
1637             }
1638         }
1639 
1640         for(size_t i = 0; i < next_ed_seed_nums.size(); i++) {
1641             if(next_ed_seed_nums[i] < rp.repeat_count)
1642                 continue;
1643             ed_seed_nums[i] = next_ed_seed_nums[i];
1644             if(seed_ext_len < min_left_ext) {
1645                 assert(left_consensuses != NULL);
1646                 (*left_consensuses)[i] += left_ext_base;
1647             }
1648             if(seed_ext_len < min_right_ext) {
1649                 assert(right_consensuses != NULL);
1650                 (*right_consensuses)[i] += right_ext_base;
1651             }
1652         }
1653 
1654         seed_ext_len++;
1655     }
1656 }
1657 
1658 template<typename TStr>
get_consensus_seq(const TStr & s,EList<SeedExt> & seeds,size_t sb,size_t se,size_t min_left_ext,size_t min_right_ext,size_t max_ed,const RepeatParameter & rp,EList<size_t> & ed_seed_nums,EList<string> * left_consensuses,EList<string> * right_consensuses) const1659 void RB_RepeatExt::get_consensus_seq(const TStr& s,
1660                                      EList<SeedExt>& seeds,
1661                                      size_t sb,
1662                                      size_t se,
1663                                      size_t min_left_ext,
1664                                      size_t min_right_ext,
1665                                      size_t max_ed,
1666                                      const RepeatParameter& rp,
1667                                      EList<size_t>& ed_seed_nums,
1668                                      EList<string>* left_consensuses,
1669                                      EList<string>* right_consensuses) const
1670 {
1671     assert_lt(sb, se);
1672     assert_leq(se, seeds.size());
1673     if(left_consensuses != NULL) {
1674         left_consensuses->clear(); left_consensuses->resize(max_ed + 1);
1675         for(size_t i = 0; i < max_ed + 1; i++) {
1676             (*left_consensuses)[i].clear();
1677         }
1678     }
1679     if(right_consensuses != NULL) {
1680         right_consensuses->clear(); right_consensuses->resize(max_ed + 1);
1681         for(size_t i = 0; i < max_ed + 1; i++) {
1682             (*right_consensuses)[i].clear();
1683         }
1684     }
1685 
1686     // cluster sequences
1687     EList<size_t> belongto;
1688     belongto.resizeExact(se - sb);
1689     for(size_t i = 0; i < belongto.size(); i++) belongto[i] = i;
1690 
1691     for(size_t i = 0; i + 1 < belongto.size(); i++) {
1692         for(size_t j = i + 1; j < belongto.size(); j++) {
1693             if(belongto[j] != j) continue;
1694             size_t ed = calc_edit_dist(s,
1695                                        seeds[sb + i],
1696                                        seeds[sb + j],
1697                                        min_left_ext,
1698                                        min_right_ext,
1699                                        max_ed + 1);
1700             if(ed <= max_ed + 1) {
1701                 belongto[j] = belongto[i];
1702             }
1703 
1704         }
1705     }
1706 
1707     // find the maximum group that has the most sequences
1708     EList<size_t> vote;
1709     vote.resizeExact(seeds.size());
1710     vote.fillZero();
1711     size_t max_group = 0;
1712     for(size_t i = 0; i < belongto.size(); i++) {
1713         size_t cur_group = belongto[i];
1714         assert_lt(cur_group, vote.size());
1715         vote[cur_group]++;
1716         if(cur_group != max_group && vote[cur_group] > vote[max_group]) {
1717             max_group = cur_group;
1718         }
1719     }
1720 
1721     // reuse vote to store seeds
1722     EList<size_t>& consensus_group = vote;
1723     consensus_group.clear();
1724     for(size_t i = 0; i < belongto.size(); i++) {
1725         if(belongto[i] == max_group)
1726             consensus_group.push_back(i);
1727     }
1728 
1729     // update ed
1730     // extend consensus string
1731     ed_seed_nums.resize(max_ed + 1); ed_seed_nums.fillZero();
1732     EList<size_t> next_ed_seed_nums; next_ed_seed_nums.resize(max_ed + 1);
1733     for(size_t i = sb; i < se; i++) seeds[i].ed = 0;
1734     size_t seed_ext_len = 0;
1735     while(seed_ext_len < max(min_left_ext, min_right_ext)) {
1736         // count base
1737         size_t l_count[4] = {0, };
1738         size_t r_count[4] = {0, };
1739 
1740         for(size_t i = 0; i < consensus_group.size(); i++) {
1741             size_t si = sb + consensus_group[i];
1742             int ch;
1743             if(seed_ext_len < min_left_ext) {
1744                 if(seeds[i].bound.first + seed_ext_len + 1 <= seeds[i].pos.first) {
1745                     ch = getSequenceBase(s, seeds[si].pos.first - seed_ext_len - 1);
1746                     assert_range(0, 3, ch);
1747                     l_count[ch]++;
1748                 }
1749             }
1750             if(seed_ext_len < min_right_ext) {
1751                 if(seeds[i].bound.second + seed_ext_len) {
1752                     ch = getSequenceBase(s, seeds[si].pos.second + seed_ext_len);
1753                     assert_range(0, 3, ch);
1754                     r_count[ch]++;
1755                 }
1756             }
1757         }
1758 
1759         // select base to be used for extension
1760         uint8_t left_ext_base = 0;
1761         if(seed_ext_len < min_left_ext) left_ext_base = max_index(l_count);
1762         uint8_t right_ext_base = 0;
1763         if(seed_ext_len < min_right_ext) right_ext_base = max_index(r_count);
1764 
1765         // estimate extended ed
1766         next_ed_seed_nums.fillZero();
1767         for(size_t i = sb; i < se; i++) {
1768             uint32_t est_ed = seeds[i].ed;
1769             if(seed_ext_len < min_left_ext) {
1770                 if(seeds[i].bound.first + seed_ext_len + 1 <= seeds[i].pos.first) {
1771                     TIndexOffU left_pos = seeds[i].pos.first - seed_ext_len - 1;
1772                     int ch = getSequenceBase(s, left_pos);
1773                     assert_range(0, 3, ch);
1774                     if (ch != left_ext_base) {
1775                         est_ed++;
1776                     }
1777                 } else {
1778                     est_ed = max_ed + 1;
1779                 }
1780             }
1781 
1782             if(seed_ext_len < min_right_ext) {
1783                 TIndexOffU right_pos = seeds[i].pos.second + seed_ext_len;
1784                 if(right_pos >= seeds[i].bound.second) {
1785                     est_ed = max_ed + 1;
1786                 } else {
1787                     int ch = getSequenceBase(s, right_pos);
1788                     assert_range(0, 3, ch);
1789                     if (ch != right_ext_base) {
1790                         est_ed++;
1791                     }
1792                 }
1793             }
1794 
1795             if(est_ed <= max_ed) {
1796                 next_ed_seed_nums[est_ed]++;
1797             }
1798         }
1799 
1800         for(size_t i = 1; i < next_ed_seed_nums.size(); i++) next_ed_seed_nums[i] += next_ed_seed_nums[i-1];
1801         if(next_ed_seed_nums[max_ed] < rp.repeat_count) {
1802             break;
1803         }
1804 
1805         for(size_t i = sb; i < se; i++) {
1806             if(seed_ext_len < min_left_ext) {
1807                 if(seeds[i].bound.first + seed_ext_len + 1 <= seeds[i].pos.first) {
1808                     TIndexOffU left_pos = seeds[i].pos.first - seed_ext_len - 1;
1809                     int ch = getSequenceBase(s, left_pos);
1810                     assert_range(0, 3, ch);
1811                     if (ch != left_ext_base) {
1812                         seeds[i].ed++;
1813                     }
1814                 } else {
1815                     seeds[i].ed = max_ed + 1;
1816                 }
1817             }
1818 
1819             if(seed_ext_len < min_right_ext) {
1820                 TIndexOffU right_pos = seeds[i].pos.second + seed_ext_len;
1821                 if(right_pos >= seeds[i].bound.second) {
1822                     seeds[i].ed = max_ed + 1;
1823                 } else {
1824                     int ch = getSequenceBase(s, right_pos);
1825                     assert_range(0, 3, ch);
1826                     if (ch != right_ext_base) {
1827                         seeds[i].ed++;
1828                     }
1829                 }
1830             }
1831         }
1832 
1833         for(size_t i = 0; i < next_ed_seed_nums.size(); i++) {
1834             if(next_ed_seed_nums[i] < rp.repeat_count)
1835                 continue;
1836             ed_seed_nums[i] = next_ed_seed_nums[i];
1837             if(seed_ext_len < min_left_ext) {
1838                 assert(left_consensuses != NULL);
1839                 (*left_consensuses)[i] += (char)("ACGT"[left_ext_base]);
1840             }
1841             if(seed_ext_len < min_right_ext) {
1842                 assert(right_consensuses != NULL);
1843                 (*right_consensuses)[i] += (char)("ACGT"[right_ext_base]);
1844             }
1845         }
1846 
1847         seed_ext_len++;
1848     }
1849 }
1850 
1851 EList<size_t> RB_Repeat::ca_ed_;
1852 EList<size_t> RB_Repeat::ca_ed2_;
1853 string RB_Repeat::ca_s_;
1854 string RB_Repeat::ca_s2_;
1855 
1856 size_t RB_Repeat::seed_merge_tried = 0;
1857 size_t RB_Repeat::seed_merged = 0;
1858 
1859 template<typename TStr>
extendConsensus(const RepeatParameter & rp,const TStr & s)1860 void RB_RepeatExt::extendConsensus(const RepeatParameter& rp,
1861                                    const TStr& s)
1862 {
1863     size_t remain = seeds_.size();
1864     EList<string> consensuses, empty_consensuses;
1865     EList<size_t> ed_seed_nums;
1866     bool left = true;
1867 
1868     const TIndexOffU default_max_ext_len = 100;
1869     const TIndexOffU seed_mm = 4;
1870     string ext_consensus = "";
1871 
1872     empty_consensuses.resize(seed_mm + 1);
1873     empty_consensuses.fill("");
1874 
1875     while(remain >= rp.repeat_count) {
1876         for(size_t i = 0; i < remain; i++) {
1877             seeds_[i].done = false;
1878             seeds_[i].curr_ext_len = 0;
1879         }
1880 
1881         // extend seeds in left or right direction
1882         TIndexOffU max_ext_len = min(default_max_ext_len, (TIndexOffU)(rp.max_repeat_len - consensus_.length()));
1883         get_consensus_seq(s,
1884                           seeds_,
1885                           0,      // seed begin
1886                           remain, // seed end
1887                           left ? max_ext_len : 0,
1888                           left ? 0 : max_ext_len,
1889                           seed_mm,
1890                           rp,
1891                           ed_seed_nums,
1892                           left ? &consensuses : NULL,
1893                           left ? NULL : &consensuses);
1894 
1895         size_t allowed_seed_mm = 0;
1896         ext_consensus.clear();
1897         for(int i = (int)seed_mm; i >= 0; i--) {
1898             size_t extlen = (ed_seed_nums[i] < rp.repeat_count ? 0 : consensuses[i].length());
1899             if(extlen <= 0 || extlen < max_ext_len * i / seed_mm)
1900                 continue;
1901 
1902             if(i > 0 && consensuses[i].length() <= consensuses[i-1].length() + 5)
1903                 continue;
1904 
1905             ext_consensus = consensuses[i];
1906             allowed_seed_mm = (size_t)i;
1907             assert(ext_consensus.length() > 0);
1908             break;
1909         }
1910         size_t num_passed_seeds = 0;
1911         if(ext_consensus.length() > 0) {
1912             if(left) consensus_ = reverse(ext_consensus) + consensus_;
1913             else consensus_ += ext_consensus;
1914 
1915             calc_edit_dist(s,
1916                            seeds_,
1917                            0,
1918                            remain,
1919                            left ? consensuses : empty_consensuses,
1920                            left ? empty_consensuses : consensuses,
1921                            allowed_seed_mm);
1922 
1923             // update seeds
1924             for(size_t i = 0; i < seeds_.size(); i++) {
1925                 SeedExt& seed = seeds_[i];
1926 
1927                 if(i < remain) {
1928                     if(seed.ed <= allowed_seed_mm) {
1929                         num_passed_seeds++;
1930                         seed.done = true;
1931                         seed.total_ed += seed.ed;
1932                         if(left) {
1933                             if(seed.left_gaps.size() > 0 &&
1934                                seed.left_gaps.back().first >= seed.orig_pos.first - seed.pos.first) {
1935                                 int gap_len = seed.left_gaps.back().second;
1936                                 seed.pos.first += gap_len;
1937                             }
1938                             seed.pos.first -= ext_consensus.length();
1939                             seed.consensus_pos.first = 0;
1940                             seed.consensus_pos.second = consensus_.length();
1941                         } else {
1942                             if(seed.right_gaps.size() > 0 &&
1943                                seed.right_gaps.back().first >= seed.pos.second - seed.orig_pos.second) {
1944                                 int gap_len = seed.right_gaps.back().second;
1945                                 seed.pos.second -= gap_len;
1946                             }
1947                             seed.pos.second += ext_consensus.length();
1948                             seed.consensus_pos.second = consensus_.length();
1949                         }
1950                     } else {
1951                         if(left) {
1952                             assert_leq(seed.curr_ext_len, ext_consensus.length());
1953                             TIndexOffU adjust = ext_consensus.length() - seed.curr_ext_len;
1954                             seed.consensus_pos.first += adjust;
1955                             seed.consensus_pos.second += ext_consensus.length();
1956                             assert_leq(seed.curr_ext_len, seed.pos.first);
1957                             seed.pos.first -= seed.curr_ext_len;
1958                         } else {
1959                             assert_leq(seed.curr_ext_len, ext_consensus.length());
1960                             seed.consensus_pos.second += seed.curr_ext_len;
1961                             seed.pos.second += seed.curr_ext_len;
1962                         }
1963                     }
1964                 } else {
1965                     if(left) {
1966                         seed.consensus_pos.first += ext_consensus.length();
1967                         seed.consensus_pos.second += ext_consensus.length();
1968                     }
1969                 }
1970             }
1971 
1972             // move up "done" seeds
1973             size_t j = 0;
1974             for(size_t i = 0; i < remain; i++) {
1975                 if(!seeds_[i].done) continue;
1976                 assert_geq(i, j);
1977                 if(i > j) {
1978                     SeedExt temp = seeds_[j];
1979                     seeds_[j] = seeds_[i];
1980                     seeds_[i] = temp;
1981                     // Find next "undone" seed
1982                     j++;
1983                     while(j < i && seeds_[j].done) {
1984                         j++;
1985                     }
1986                     assert(j < remain && !seeds_[j].done);
1987                 } else {
1988                     j = i + 1;
1989                 }
1990             }
1991         }
1992 
1993         remain = num_passed_seeds;
1994         if(remain < rp.repeat_count) {
1995             if(left) {
1996                 left = false;
1997                 remain = seeds_.size();
1998             }
1999         }
2000     } // while(remain >= rp.repeat_count)
2001 
2002 #ifndef NDEBUG
2003     // make sure seed positions are unique
2004     EList<pair<TIndexOffU, TIndexOffU> > seed_poses;
2005     for(size_t i = 0; i < seeds_.size(); i++) {
2006         seed_poses.expand();
2007         seed_poses.back() = seeds_[i].orig_pos;
2008     }
2009     seed_poses.sort();
2010     for(size_t i = 0; i + 1 < seed_poses.size(); i++) {
2011         if(seed_poses[i].first == seed_poses[i+1].first) {
2012             assert_lt(seed_poses[i].second, seed_poses[i+1].second);
2013         } else {
2014             assert_lt(seed_poses[i].first, seed_poses[i+1].first);
2015         }
2016     }
2017 
2018     for(size_t i = 0; i < seeds_.size(); i++) {
2019         assert(seeds_[i].valid());
2020     }
2021 #endif
2022 
2023     internal_update();
2024 
2025     // check repeats within a repeat sequence
2026     self_repeat_ = false;
2027     for(size_t i = 0; i + 1 < seed_ranges_.size(); i++) {
2028         const RB_AlleleCoord& range = seed_ranges_[i];
2029         for(size_t j = i + 1; j < seed_ranges_.size(); j++) {
2030             RB_AlleleCoord& range2 = seed_ranges_[j];
2031             if(range.right <= range2.left)
2032                 break;
2033             self_repeat_  = true;
2034         }
2035     }
2036 }
2037 
overlap(const RB_Repeat & o,bool & contain,bool & left,size_t & seed_i,size_t & seed_j,bool debug) const2038 bool RB_Repeat::overlap(const RB_Repeat& o,
2039                         bool& contain,
2040                         bool& left,
2041                         size_t& seed_i,
2042                         size_t& seed_j,
2043                         bool debug) const
2044 {
2045     contain = left = false;
2046     seed_i = seed_j = 0;
2047     size_t p = 0, p2 = 0;
2048     while(p < seed_ranges_.size() && p2 < o.seed_ranges_.size()) {
2049         const RB_AlleleCoord& range = seed_ranges_[p];
2050         const SeedExt& seed = seeds_[range.idx];
2051         Range range_ = seed.getExtendedRange(consensus_.length());
2052         RB_AlleleCoord ex_range(range_.first, range_.second, p);
2053         bool representative = (float)range.len() >= consensus_.length() * 0.95f;
2054 
2055         const RB_AlleleCoord& range2 = o.seed_ranges_[p2];
2056         const SeedExt& seed2 = o.seeds_[range2.idx];
2057         Range range2_ = seed2.getExtendedRange(o.consensus().length());
2058         RB_AlleleCoord ex_range2(range2_.first, range2_.second, p2);
2059         bool representative2 = (float)range2.len() >= o.consensus_.length() * 0.95f;
2060 
2061         seed_i = range.idx;
2062         seed_j = range2.idx;
2063 
2064         const size_t relax = 5;
2065         if(representative && representative2) {
2066             if(ex_range.overlap_len(ex_range2) > 0) {
2067                 if(ex_range.contain(ex_range2, relax)) {
2068                     contain = true;
2069                     left = true;
2070                 } else if(ex_range2.contain(ex_range, relax)) {
2071                     contain = true;
2072                     left = false;
2073                 } else {
2074                     left = ex_range.left <= ex_range2.left;
2075                 }
2076                 return true;
2077             }
2078         } else if(representative) {
2079             if(range2.contain(ex_range, relax)) {
2080                 contain = true;
2081                 left = false;
2082                 return true;
2083             }
2084         } else if(representative2) {
2085             if(range.contain(ex_range2, relax)) {
2086                 contain = true;
2087                 left = true;
2088                 return true;
2089             }
2090         }
2091 
2092         if(range.right <= range2.right) p++;
2093         if(range2.right <= range.right) p2++;
2094     }
2095     return false;
2096 }
2097 
showInfo(const RepeatParameter & rp,CoordHelper & coordHelper) const2098 void RB_Repeat::showInfo(const RepeatParameter& rp,
2099                          CoordHelper& coordHelper) const
2100 {
2101     cerr << "\trepeat id: " << repeat_id_ << endl;
2102     cerr << "\tnumber of alleles: " << seeds_.size() << endl;
2103     cerr << "\tconsensus length: " << consensus_.length() << endl;
2104     cerr << consensus_ << endl;
2105     for(size_t i = 0; i < seeds_.size(); i++) {
2106         const SeedExt& seed = seeds_[i];
2107         size_t ext_len = seed.getLength();
2108         if(ext_len < rp.min_repeat_len) continue;
2109         bool sense_strand = seed.pos.first < coordHelper.forward_length();
2110 
2111         cerr << "\t\t" << setw(4) << i << " " << setw(4) << ext_len;
2112         cerr << " " << (sense_strand ? '+' : '-');
2113         cerr << " " << setw(10) << seed.pos.first << "  " << setw(10) << seed.pos.second;
2114         cerr << " " << setw(4) << seed.consensus_pos.first << "  " << setw(4) << seed.consensus_pos.second;
2115         cerr << " " << (seed.aligned ? "aligned" : "unaligned");
2116         cerr << endl;
2117     }
2118 }
2119 
2120 template<typename TStr>
generateSNPs(const RepeatParameter & rp,const TStr & s,TIndexOffU grp_id)2121 void RB_Repeat::generateSNPs(const RepeatParameter& rp, const TStr& s, TIndexOffU grp_id) {
2122     EList<SeedExt>& seeds = this->seeds();
2123 
2124     for(size_t i = 0; i < seeds.size(); i++) {
2125         SeedExt& seed = seeds[i];
2126 
2127         if(!seed.aligned) {
2128             continue;
2129         }
2130 
2131         if(seed.getLength() < rp.min_repeat_len) {
2132             continue;
2133         }
2134         assert_eq(seed.getLength(), seed.pos.second - seed.pos.first);
2135         seed.generateSNPs(s, consensus(), snps());
2136     }
2137     if(snps().size() > 0) {
2138         sort(snps_.begin(), snps().end(), SeedSNP::cmpSeedSNPByPos);
2139     }
2140 }
2141 
saveSNPs(ofstream & fp,TIndexOffU grp_id,TIndexOffU & snp_id_base)2142 void RB_Repeat::saveSNPs(ofstream &fp, TIndexOffU grp_id, TIndexOffU& snp_id_base)
2143 {
2144     string chr_name = "rep" + to_string(repeat_id_);
2145 
2146     for(size_t j = 0; j < snps_.size(); j++) {
2147         SeedSNP& snp = *snps_[j];
2148         // assign SNPid
2149         snp.id = (snp_id_base++);
2150 
2151         fp << "rps" << snp.id;
2152         fp << "\t";
2153 
2154         if(snp.type == EDIT_TYPE_MM) {
2155             fp << "single";
2156         } else if(snp.type == EDIT_TYPE_READ_GAP) {
2157             fp << "deletion";
2158         } else if(snp.type == EDIT_TYPE_REF_GAP) {
2159             fp << "insertion";
2160         } else {
2161             assert(false);
2162         }
2163         fp << "\t";
2164 
2165         fp << chr_name;
2166         fp << "\t";
2167 
2168         fp << snp.pos;
2169         fp << "\t";
2170         if(snp.type == EDIT_TYPE_MM || snp.type == EDIT_TYPE_REF_GAP) {
2171             fp << snp.base;
2172         } else if(snp.type == EDIT_TYPE_READ_GAP) {
2173             fp << snp.len;
2174         } else {
2175             assert(false);
2176         }
2177         fp << endl;
2178     }
2179 }
2180 
mergeable(const RB_Repeat & o) const2181 float RB_RepeatExt::mergeable(const RB_Repeat& o) const
2182 {
2183     const EList<RB_AlleleCoord>& ranges = seed_ranges();
2184     const EList<RB_AlleleCoord>& ranges2 = o.seed_ranges();
2185     size_t num_overlap_bp = 0;
2186     size_t p = 0, p2 = 0;
2187     while(p < ranges.size() && p2 < ranges2.size()) {
2188         const RB_AlleleCoord& range = ranges[p];
2189         const RB_AlleleCoord& range2 = ranges2[p2];
2190         TIndexOffU overlap = range.overlap_len(range2);
2191         num_overlap_bp += overlap;
2192         if(range.right <= range2.right) p++;
2193         else                            p2++;
2194     }
2195     size_t num_total_bp = 0, num_total_bp2 = 0;
2196     for(size_t r = 0; r < ranges.size(); r++) num_total_bp += (ranges[r].right - ranges[r].left);
2197     for(size_t r = 0; r < ranges2.size(); r++) num_total_bp2 += (ranges2[r].right - ranges2[r].left);
2198     float portion = float(num_overlap_bp) / float(min(num_total_bp, num_total_bp2));
2199     return portion;
2200 }
2201 
get_next_range(const EList<int> & offsets,size_t i,Range & r,float & avg)2202 inline void get_next_range(const EList<int>& offsets, size_t i, Range& r, float& avg)
2203 {
2204     r.first = i;
2205     for(; r.first < offsets.size() && offsets[r.first] < 0; r.first++);
2206     r.second = r.first + 1;
2207     avg = 0.0f;
2208     if(r.first < offsets.size()) {
2209         float diff = (float)offsets[r.first] - (float)r.first;
2210         avg += diff;
2211     }
2212     for(; r.second < offsets.size() &&
2213         offsets[r.second] >= 0 &&
2214         (r.second == 0 || offsets[r.second] >= offsets[r.second - 1]);
2215         r.second++) {
2216         float diff = (float)offsets[r.second] - (float)r.second;
2217         avg += diff;
2218     }
2219     avg /= (float)(r.second - r.first);
2220     return;
2221 }
2222 
2223 template<typename TStr>
align(const RepeatParameter & rp,const TStr & ref,const string & s,const EList<pair<size_t,size_t>> & s_kmer_table,const string & s2,EList<int> & offsets,size_t k,SeedExt & seed,int consensus_approx_left,int consensus_approx_right,size_t left,size_t right,bool debug)2224 bool RB_RepeatExt::align(const RepeatParameter& rp,
2225                          const TStr& ref,
2226                          const string& s,
2227                          const EList<pair<size_t, size_t> >& s_kmer_table,
2228                          const string& s2,
2229                          EList<int>& offsets,
2230                          size_t k,
2231                          SeedExt& seed,
2232                          int consensus_approx_left,
2233                          int consensus_approx_right,
2234                          size_t left,
2235                          size_t right,
2236                          bool debug)
2237 {
2238     RB_Repeat::seed_merge_tried++;
2239 
2240     seed.reset();
2241     seed.pos.first = left;
2242     seed.pos.second = right;
2243     seed.orig_pos.first = seed.pos.first;
2244     seed.orig_pos.second = seed.orig_pos.first;
2245     seed.consensus_pos.first = 0;
2246     seed.consensus_pos.second = consensus_.length();
2247     seed.aligned = false;
2248 
2249     {
2250         const int query_len = right - left;
2251         const int consensus_len = consensus_approx_right - consensus_approx_left;
2252         const int abs_gap_len = max<int>(abs(consensus_len - query_len), 5);
2253 
2254         offsets.resize(s2.length());
2255         offsets.fill(-1);
2256         pair<size_t, size_t> kmer(0, 0);
2257         for(size_t i = 0; i + k <= s2.length(); i++) {
2258             if(i == 0) {
2259                 kmer.first = extract_kmer(s2, i, k);
2260             } else {
2261                 kmer.first = next_kmer(kmer.first, s2[i+k-1], k);
2262             }
2263             size_t lb = s_kmer_table.bsearchLoBound(kmer);
2264             while(lb < s_kmer_table.size() && kmer.first == s_kmer_table[lb].first) {
2265                 int expected = (int)i + consensus_approx_left;
2266                 int real = (int)s_kmer_table[lb].second;
2267                 int abs_diff = abs(expected - real);
2268                 if(abs_diff <= abs_gap_len * 2 || debug) {
2269                     if(offsets[i] == -1) {
2270                         offsets[i] = (int)s_kmer_table[lb].second;
2271                     } else if(offsets[i] >= 0) {
2272                         offsets[i] = -2;
2273                     } else {
2274                         assert_lt(offsets[i], -1);
2275                         offsets[i] -= 1;
2276                     }
2277                 }
2278                 lb++;
2279             }
2280             if(offsets[i] > 0 && i + k == s2.length()) {
2281                 for(size_t j = i + 1; j < s2.length(); j++) {
2282                     offsets[j] = offsets[j-1] + 1;
2283                 }
2284             }
2285         }
2286     }
2287 
2288     if(debug) {
2289         cerr << "initial offsets" << endl;
2290         for(size_t j = 0; j < offsets.size(); j++) {
2291             cout << j << ": " << offsets[j] << " " << s2[j] << ": " << (offsets[j] < 0 ? ' ' : s[offsets[j]]) << endl;
2292         }
2293     }
2294 
2295     // remove inconsistent positions
2296     Range range; float range_avg;
2297     get_next_range(offsets, 0, range, range_avg);
2298     while(range.second < offsets.size()) {
2299         Range range2; float range_avg2;
2300         get_next_range(offsets, range.second, range2, range_avg2);
2301         if(range2.first >= offsets.size())
2302             break;
2303 
2304         assert_leq(range.second, range2.first);
2305 
2306         float abs_diff = abs(range_avg - range_avg2);
2307         if(offsets[range.second - 1] > offsets[range2.first] ||
2308            (abs_diff > 10.0f && (range.second - range.first < 5 || range2.second - range2.first < 5)) ||
2309            abs_diff > 20.0f) {
2310             if(range.second - range.first < range2.second - range2.first) {
2311                 for(size_t i = range.first; i < range.second; i++) {
2312                     offsets[i] = -1;
2313                 }
2314                 range = range2;
2315                 range_avg = range_avg2;
2316             } else {
2317                 for(size_t i = range2.first; i < range2.second; i++) {
2318                     offsets[i] = -1;
2319                 }
2320             }
2321         } else {
2322             range = range2;
2323             range_avg = range_avg2;
2324         }
2325     }
2326 
2327     bool weighted_avg_inited = false;
2328     float weighted_avg = -1.0f;
2329     for(size_t i = 0; i < offsets.size(); i++) {
2330         if(offsets[i] < 0)
2331             continue;
2332 
2333         float diff = (float)offsets[i] - (float)i;
2334         if(weighted_avg_inited) {
2335             if(abs(diff - weighted_avg) > 20.0f) {
2336                 offsets[i] = -1;
2337                 continue;
2338             }
2339         }
2340 
2341         if(weighted_avg < 0.0f) {
2342             weighted_avg = diff;
2343             weighted_avg_inited = true;
2344         } else {
2345             weighted_avg = 0.8f * weighted_avg + 0.2f * diff;
2346         }
2347     }
2348 
2349     if(debug) {
2350         cerr << "after filtering" << endl;
2351         for(size_t j = 0; j < offsets.size(); j++) {
2352             cout << j << ": " << offsets[j] << " " << s2[j] << ": " << (offsets[j] < 0 ? ' ' : s[offsets[j]]) << endl;
2353         }
2354     }
2355 
2356     int i = 0;
2357     while(i < offsets.size()) {
2358         // skip non-negative offsets
2359         for(; i < offsets.size() && offsets[i] >= 0; i++);
2360         if(i >= offsets.size()) break;
2361         int j = i;
2362         for(; j < offsets.size(); j++) {
2363             if(offsets[j] >= 0)
2364                 break;
2365         }
2366         assert(i >= offsets.size() || offsets[i] < 0);
2367         assert(j >= offsets.size() || offsets[j] >= 0);
2368         if(i > 0 && j < offsets.size()) {
2369             i -= 1;
2370             int left = offsets[i], right = offsets[j];
2371             assert_geq(left, 0);
2372             if(left > right) return false;
2373             assert_leq(left, right);
2374             int ref_len = right - left + 1;
2375             int query_len = j - i + 1;
2376             if(query_len == ref_len) { // match
2377                 for(size_t i2 = i + 1; i2 < j; i2++)
2378                     offsets[i2] = offsets[i2-1] + 1;
2379             } else { // deletion or insertion
2380                 bool del = query_len < ref_len;
2381                 size_t gap_len = del ? ref_len - query_len : query_len - ref_len;
2382                 size_t max_len = max(ref_len, query_len);
2383                 const size_t max_mm = max_len / 25 + 1;
2384                 const size_t very_max_mm = max<size_t>(max_len / 2, max_mm);
2385                 ca_s_ = s.substr(left, max_len);
2386                 ca_s2_ = s2.substr(i, max_len);
2387 
2388                 size_t gap_pos = 0, mm = very_max_mm + 1;
2389                 find_gap_pos(ca_s_,
2390                              ca_s2_,
2391                              ca_ed_,
2392                              ca_ed2_,
2393                              del,
2394                              gap_len,
2395                              very_max_mm,
2396                              gap_pos,
2397                              mm,
2398                              debug);
2399 
2400                 if(mm > very_max_mm) {
2401                     return false;
2402                 }
2403 
2404                 assert_lt(gap_pos, query_len);
2405                 if(del) {
2406                     for(size_t i2 = i + 1; i2 < j; i2++) {
2407                         if(i2 - i == gap_pos) {
2408                             offsets[i2] = offsets[i2-1] + gap_len;
2409                         } else {
2410                             offsets[i2] = offsets[i2-1] + 1;
2411                         }
2412                     }
2413                 } else {
2414                     for(size_t i2 = i + 1; i2 < j; i2++) {
2415                         if(i2 - i >= gap_pos && i2 - i < gap_pos + gap_len) {
2416                             offsets[i2] = offsets[i2-1];
2417                         } else {
2418                             offsets[i2] = offsets[i2-1] + 1;
2419                         }
2420                     }
2421                 }
2422             }
2423         }
2424 
2425         i = j;
2426     }
2427 
2428     if(debug) {
2429         cerr << "final offsets" << endl;
2430         for(size_t j = 0; j < offsets.size(); j++) {
2431             cout << j << ": " << offsets[j] << " " << s2[j] << ": " << (offsets[j] < 0 ? ' ' : s[offsets[j]]) << endl;
2432         }
2433     }
2434 
2435 #ifndef NDEBUG
2436     for(size_t i = 1; i < offsets.size(); i++) {
2437         if(offsets[i-1] < 0 || offsets[i] < 0) continue;
2438         assert_leq(offsets[i-1], offsets[i]);
2439     }
2440 #endif
2441 
2442     size_t b, e;
2443     for(b = 0; b < offsets.size() && offsets[b] < 0; b++);
2444     if(b >= offsets.size()) return false;
2445     assert_lt((size_t)b, offsets.size());
2446     for(e = offsets.size() - 1; e > b && offsets[e] < 0; e--);
2447     if(b == e) return false;
2448     for(size_t p = b; p <= e; p++) {
2449         if(offsets[p] < 0) return false;
2450     }
2451 
2452     // try to fill the ends
2453     if(b > 0) {
2454         int mm = 0, pb = (int)b;
2455         for(int i = pb - 1; i >= 0; i--) {
2456             assert_geq(offsets[i+1], 0);
2457             if(offsets[i+1] == 0) break;
2458             if(s2[i] != s[offsets[i+1] - 1])
2459                 mm++;
2460             if(pb - i < 25 * (mm - 1))
2461                 break;
2462             offsets[i] = offsets[i+1] - 1;
2463             b = (size_t)i;
2464         }
2465     }
2466     if(e + 1 < offsets.size()) {
2467         int mm = 0, prev_end = (int)e;
2468         for(int i = prev_end + 1; i < offsets.size(); i++) {
2469             assert_geq(offsets[i-1], 0);
2470             if(offsets[i-1] + 1 >= s.length()) break;
2471             if(s2[i] != s[offsets[i-1] + 1])
2472                 mm++;
2473             if(i - prev_end < 25 * (mm - 1))
2474                 break;
2475             offsets[i] = offsets[i-1] + 1;
2476             e = (size_t)i;
2477         }
2478     }
2479 
2480     assert_geq(seed.pos.first, (TIndexOffU)b);
2481     seed.pos.first += (TIndexOffU)b;
2482     seed.pos.second = seed.pos.first + e - b + 1;
2483     seed.orig_pos.first = seed.pos.first;
2484     seed.orig_pos.second = seed.pos.first;
2485     seed.consensus_pos.first = offsets[b];
2486     seed.consensus_pos.second = offsets[e] + 1;
2487     seed.aligned = true;
2488 
2489     for(int p = b; p < e; p++) {
2490         assert_geq(offsets[p], 0);
2491         assert_leq(offsets[p], offsets[p+1]);
2492         if(offsets[p] + 1 == offsets[p+1]) { // match
2493             continue;
2494         } else if(offsets[p] + 1 < offsets[p+1]) { // deletion
2495             seed.right_gaps.expand();
2496             seed.right_gaps.back().first = p + 1 - b;
2497             seed.right_gaps.back().second = offsets[p+1] - offsets[p] - 1;
2498         } else {
2499             assert_eq(offsets[p], offsets[p+1]);
2500             int p2 = p + 1;
2501             for(; offsets[p2] == offsets[p2+1]; p2++);
2502             seed.right_gaps.expand();
2503             seed.right_gaps.back().first = p + 1 - b;
2504             seed.right_gaps.back().second = (int)p - (int)p2;
2505             p = p2;
2506         }
2507     }
2508 
2509     if(debug) {
2510         string consensus_str = consensus_.substr(seed.consensus_pos.first, seed.consensus_pos.second - seed.consensus_pos.first);
2511         string seed_str;
2512         seed.getExtendedSeedSequence(ref, seed_str);
2513         assert_eq(consensus_str.length(), seed_str.length());
2514         cerr << "consensus: " << consensus_str << endl;
2515         cerr << "seed     : " << seed_str << endl;
2516     }
2517 
2518     RB_Repeat::seed_merged++;
2519 
2520     return true;
2521 }
2522 
isSelfRepeat(const RepeatParameter & rp,const string & s,const EList<pair<size_t,size_t>> & s_kmer_table,EList<int> & offsets,size_t k,bool debug)2523 bool RB_RepeatExt::isSelfRepeat(const RepeatParameter& rp,
2524                                 const string& s,
2525                                 const EList<pair<size_t, size_t> >& s_kmer_table,
2526                                 EList<int>& offsets,
2527                                 size_t k,
2528                                 bool debug)
2529 {
2530     offsets.resize(s.length());
2531     offsets.fill(-1);
2532     pair<size_t, size_t> kmer(0, 0);
2533     for(size_t i = 0; i + k <= s.length(); i++) {
2534         if(i == 0) {
2535             kmer.first = extract_kmer(s, i, k);
2536         } else {
2537             kmer.first = next_kmer(kmer.first, s[i+k-1], k);
2538         }
2539         size_t lb = s_kmer_table.bsearchLoBound(kmer);
2540         while(lb < s_kmer_table.size() && kmer.first == s_kmer_table[lb].first) {
2541             if(offsets[i] == -1) {
2542                 offsets[i] = (int)s_kmer_table[lb].second;
2543             } else if(offsets[i] >= 0) {
2544                 offsets[i] = -2;
2545             } else {
2546                 assert_lt(offsets[i], -1);
2547                 offsets[i] -= 1;
2548             }
2549             lb++;
2550         }
2551         if(offsets[i] > 0 && i + k == s.length()) {
2552             for(size_t j = i + 1; j < s.length(); j++) {
2553                 offsets[j] = offsets[j-1] + 1;
2554             }
2555         }
2556     }
2557 
2558     if(debug) {
2559         cerr << "offsets" << endl;
2560         for(size_t j = 0; j < offsets.size(); j++) {
2561             cout << j << ": " << offsets[j] << " " << s[j] << ": " << (offsets[j] < 0 ? ' ' : s[offsets[j]]) << endl;
2562         }
2563     }
2564 
2565     const size_t min_repeat_size = 100;
2566     size_t repeat_count = 0;
2567     for(size_t i = 0; i + min_repeat_size < offsets.size(); i++) {
2568         if(offsets[i] >= -1)
2569             continue;
2570         size_t j = i + 1;
2571         for(; j < offsets.size() && offsets[j] <= -2; j++);
2572         if(j - i >= min_repeat_size)
2573             repeat_count++;
2574         i = j;
2575     }
2576 
2577     return repeat_count >= 2;
2578 }
2579 
2580 template<typename TStr>
merge(const RepeatParameter & rp,const TStr & s,RB_SWAligner & swalginer,const RB_RepeatExt & o,bool contain,size_t seed_i,size_t seed_j,bool debug)2581 bool RB_RepeatExt::merge(const RepeatParameter& rp,
2582                          const TStr& s,
2583                          RB_SWAligner& swalginer,
2584                          const RB_RepeatExt& o,
2585                          bool contain,
2586                          size_t seed_i,
2587                          size_t seed_j,
2588                          bool debug)
2589 {
2590     // construct a new consensus sequence
2591     string prev_consensus = consensus_;
2592     size_t consensus_add_len = 0;
2593     {
2594         assert_lt(seed_i, seeds_.size());
2595         assert_lt(seed_j, o.seeds_.size());
2596         const SeedExt& seed = seeds_[seed_i];
2597         const SeedExt& oseed = o.seeds_[seed_j];
2598 
2599         Range range = seed.getExtendedRange(consensus_.length());
2600         Range orange = oseed.getExtendedRange(o.consensus_.length());
2601         assert_leq(range.first, orange.first + 10);
2602 
2603         consensus_add_len = (int)orange.first - (int)range.first;
2604         if(!contain) {
2605             if(range.second <= orange.first) {
2606                 cerr << "something wrong: " << range.first << "-" << range.second << " ";
2607                 cerr << orange.first << "-" << orange.second << " " << o.consensus_.length() << endl;
2608                 assert(false);
2609             }
2610             if(range.second > orange.first &&
2611                range.second - orange.first < o.consensus_.length()) {
2612                 consensus_ += o.consensus_.substr(range.second - orange.first);
2613             }
2614         }
2615     }
2616 
2617     EList<pair<size_t, size_t> > merge_list;
2618     merge_list.reserveExact(o.seed_ranges_.size());
2619     size_t p = 0, p2 = 0;
2620     const size_t relax = 5;
2621     while(p < seed_ranges_.size() && p2 < o.seed_ranges_.size()) {
2622         const RB_AlleleCoord& range = seed_ranges_[p];
2623         assert_lt(range.idx, seeds_.size());
2624         const RB_AlleleCoord& range2 = o.seed_ranges_[p2];
2625         assert_lt(range2.idx, o.seeds_.size());
2626         if(range.contain(range2, relax)) {
2627             merge_list.expand();
2628             merge_list.back().first = p;
2629             merge_list.back().second = p2;
2630             if(debug) {
2631                 cerr << p << ":" << range.left << "-" << range.right << " > ";
2632                 cerr << p2 << ":" << range2.left << "-" << range2.right << endl;
2633             }
2634         } else if(range2.contain(range, relax)) {
2635             merge_list.expand();
2636             merge_list.back().first = p;
2637             merge_list.back().second = p2;
2638             if(debug) {
2639                 cerr << p << ":" << range.left << "-" << range.right << " < ";
2640                 cerr << p2 << ":" << range2.left << "-" << range2.right << endl;
2641             }
2642         } else {
2643             TIndexOffU overlap_len = range.overlap_len(range2);
2644             bool stored = !merge_list.empty() && merge_list.back().first == p;
2645             bool stored2 = !merge_list.empty() && merge_list.back().second == p2;
2646             if(overlap_len > 0) {
2647                 if(!stored && !stored2) {
2648                     merge_list.expand();
2649                     merge_list.back().first = p;
2650                     merge_list.back().second = p2;
2651 
2652                     if(debug) {
2653                         cerr << p << ":" << range.left << "-" << range.right << " ol ";
2654                         cerr << p2 << ":" << range2.left << "-" << range2.right << endl;
2655                     }
2656                 }
2657             } else {
2658                 if(range2.right <= range.left) {
2659                     if(!stored2) {
2660                         merge_list.expand();
2661                         merge_list.back().first = numeric_limits<size_t>::max();
2662                         merge_list.back().second = p2;
2663 
2664                         if(debug) {
2665                             cerr << p << ":" << range.left << "-" << range.right << " <> ";
2666                             cerr << p2 << ":" << range2.left << "-" << range2.right << endl;
2667                         }
2668                     }
2669                 } else {
2670                     assert_leq(range.right, range2.left);
2671                     if(!stored) {
2672                         merge_list.expand();
2673                         merge_list.back().first = p;
2674                         merge_list.back().second = numeric_limits<size_t>::max();
2675 
2676                         if(debug) {
2677                             cerr << p << ":" << range.left << "-" << range.right << " <> ";
2678                             cerr << p2 << ":" << range2.left << "-" << range2.right << endl;
2679                         }
2680                     }
2681                 }
2682             }
2683         }
2684         if(range.right <= range2.right) p++;
2685         if(range2.right <= range.right) p2++;
2686     }
2687     assert(p == seeds_.size() || p2 == o.seeds_.size());
2688 
2689     while(p < seed_ranges_.size()) {
2690         bool stored = !merge_list.empty() && merge_list.back().first == p;
2691         if(!stored) {
2692             const RB_AlleleCoord& range = seed_ranges_[p];
2693             merge_list.expand();
2694             merge_list.back().first = p;
2695             merge_list.back().second = numeric_limits<size_t>::max();
2696 
2697             if(debug) {
2698                 cerr << p << ":" << range.left << "-" << range.right << " : <> " << endl;
2699             }
2700         }
2701 
2702         p++;
2703     }
2704 
2705     while(p2 < o.seed_ranges_.size()) {
2706         bool stored2 = !merge_list.empty() && merge_list.back().second == p2;
2707         if(!stored2) {
2708             const RB_AlleleCoord& range2 = o.seed_ranges_[p2];
2709             merge_list.expand();
2710             merge_list.back().first = numeric_limits<size_t>::max();
2711             merge_list.back().second = p2;
2712 
2713             if(debug) {
2714                 cerr << ": <> " << p2 << ":" << range2.left << "-" << range2.right << endl;
2715             }
2716         }
2717 
2718         p2++;
2719     }
2720 
2721     assert(!merge_list.empty());
2722 
2723     if(debug) {
2724         for(size_t i = 0; i < merge_list.size(); i++) {
2725             cerr << "merge list:" << endl;
2726             cerr << "\t" << (merge_list[i].first < seed_ranges_.size() ? (int)merge_list[i].first : -1);
2727             cerr << " " << (merge_list[i].second < o.seed_ranges_.size() ? (int)merge_list[i].second : -1) << endl;
2728         }
2729     }
2730 
2731     const size_t kmer_len = 12;
2732     EList<pair<size_t, size_t> > kmer_table;
2733     build_kmer_table(consensus_, kmer_table, kmer_len);
2734     EList<int> offsets;
2735 
2736     bool self_repeat = isSelfRepeat(rp,
2737                                     consensus_,
2738                                     kmer_table,
2739                                     offsets,
2740                                     kmer_len,
2741                                     debug);
2742     if(self_repeat) {
2743         consensus_ = prev_consensus;
2744         return false;
2745     }
2746 
2747     string seq;
2748     for(size_t i = 0; i < merge_list.size(); i++) {
2749         size_t seed_id = (merge_list[i].first < seed_ranges_.size() ? seed_ranges_[merge_list[i].first].idx : merge_list[i].first);
2750         size_t oseed_id = (merge_list[i].second < o.seed_ranges_.size() ? o.seed_ranges_[merge_list[i].second].idx : merge_list[i].second);
2751 
2752         if(seed_id < seeds_.size()) {
2753             if(oseed_id >= o.seeds_.size())
2754                 continue;
2755             else if(seed_ranges_[merge_list[i].first].contain(o.seed_ranges_[merge_list[i].second]))
2756                 continue;
2757         }
2758 
2759         const SeedExt* seed = (seed_id < seeds_.size() ? &seeds_[seed_id] : NULL);
2760         const SeedExt* oseed = (oseed_id < o.seeds_.size() ? &o.seeds_[oseed_id] : NULL);
2761         assert(seed != NULL || oseed != NULL);
2762 
2763         size_t left = (seed != NULL ? seed->pos.first : numeric_limits<size_t>::max());
2764         size_t right = (seed != NULL ? seed->pos.second : 0);
2765         int consensus_approx_left = (seed != NULL ? seed->consensus_pos.first : numeric_limits<int>::max());
2766         int consensus_approx_right = (seed != NULL ? seed->consensus_pos.second : 0);
2767         if(oseed != NULL) {
2768             if(oseed->pos.first < left) {
2769                 left = oseed->pos.first;
2770             }
2771             if(oseed->pos.second > right) {
2772                 right = oseed->pos.second;
2773             }
2774 
2775             int oconsensus_approx_left = oseed->consensus_pos.first + consensus_add_len;
2776             if(oconsensus_approx_left < consensus_approx_left) {
2777                 consensus_approx_left = oconsensus_approx_left;
2778             }
2779             int oconsensus_approx_right = oseed->consensus_pos.second + consensus_add_len;
2780             if(oconsensus_approx_right > consensus_approx_right) {
2781                 consensus_approx_right = oconsensus_approx_right;
2782             }
2783         }
2784 
2785         getString(s, left, right - left, seq);
2786         if(seed_id >= seeds_.size()) {
2787             seed_id = seeds_.size();
2788             seeds_.expand();
2789         }
2790 
2791         /* bool succ = */ align(rp,
2792                                 s,
2793                                 consensus_,
2794                                 kmer_table,
2795                                 seq,
2796                                 offsets,
2797                                 kmer_len,
2798                                 seeds_[seed_id],
2799                                 consensus_approx_left,
2800                                 consensus_approx_right,
2801                                 left,
2802                                 right,
2803                                 debug && right - left == 1080);
2804     }
2805 
2806     while(true) {
2807         internal_update();
2808 
2809         size_t remove_count = 0;
2810         for(size_t i = 0; i + 1 < seed_ranges_.size(); i++) {
2811             RB_AlleleCoord& range = seed_ranges_[i];
2812             if(range.left == numeric_limits<size_t>::max())
2813                 break;
2814             for(size_t j = i + 1; j < seed_ranges_.size(); j++) {
2815                 RB_AlleleCoord& range2 = seed_ranges_[j];
2816                 if(range2.left == numeric_limits<size_t>::max())
2817                     break;
2818                 if(range.right <= range2.left)
2819                     break;
2820 
2821                 getString(s, range.left, range2.right - range.left, seq);
2822 
2823                 /* bool succ = */ align(rp,
2824                                         s,
2825                                         consensus_,
2826                                         kmer_table,
2827                                         seq,
2828                                         offsets,
2829                                         kmer_len,
2830                                         seeds_[range.idx],
2831                                         seeds_[range.idx].consensus_pos.first,
2832                                         seeds_[range2.idx].consensus_pos.second,
2833                                         range.left,
2834                                         range2.right,
2835                                         debug && range.left == 692422);
2836 
2837                 range.left = seeds_[range.idx].pos.first;
2838                 range.right = seeds_[range.idx].pos.second;
2839                 seeds_[range2.idx].reset();
2840                 range2.left = numeric_limits<size_t>::max();
2841                 remove_count++;
2842             }
2843         }
2844 
2845         if(remove_count <= 0) break;
2846 
2847         sort(seeds_.begin(), seeds_.end(), seedCmp);
2848         seeds_.resize(seeds_.size() - remove_count);
2849     }
2850 
2851     return true;
2852 }
2853 
2854 #define DMAX std::numeric_limits<double>::max()
2855 
RB_SWAligner()2856 RB_SWAligner::RB_SWAligner()
2857 {
2858     rnd_.init(0);
2859 }
2860 
~RB_SWAligner()2861 RB_SWAligner::~RB_SWAligner()
2862 {
2863     if(sc_) {
2864         delete sc_;
2865     }
2866 }
2867 
init_dyn(const RepeatParameter & rp)2868 void RB_SWAligner::init_dyn(const RepeatParameter& rp)
2869 {
2870     const int MM_PEN = 3;
2871     // const int MM_PEN = 6;
2872     const int GAP_PEN_LIN = 2;
2873     // const int GAP_PEN_LIN = (((MM_PEN) * rpt_edit_ + 1) * 1.0);
2874     const int GAP_PEN_CON = 4;
2875     // const int GAP_PEN_CON = (((MM_PEN) * rpt_edit_ + 1) * 1.0);
2876     const int MAX_PEN = MAX_I16;
2877 
2878     scoreMin_.init(SIMPLE_FUNC_LINEAR, rp.max_edit * MM_PEN * -1.0, 0.0);
2879     nCeil_.init(SIMPLE_FUNC_LINEAR, 0.0, 0.0);
2880 
2881     penCanIntronLen_.init(SIMPLE_FUNC_LOG, -8, 1);
2882     penNoncanIntronLen_.init(SIMPLE_FUNC_LOG, -8, 1);
2883 
2884     sc_ = new Scoring(
2885                       DEFAULT_MATCH_BONUS,     // constant reward for match
2886                       DEFAULT_MM_PENALTY_TYPE,     // how to penalize mismatches
2887                       MM_PEN,      // max mm penalty
2888                       MM_PEN,      // min mm penalty
2889                       MAX_PEN,      // max sc penalty
2890                       MAX_PEN,      // min sc penalty
2891                       scoreMin_,       // min score as function of read len
2892                       nCeil_,          // max # Ns as function of read len
2893                       DEFAULT_N_PENALTY_TYPE,      // how to penalize Ns in the read
2894                       DEFAULT_N_PENALTY,           // constant if N pelanty is a constant
2895                       DEFAULT_N_CAT_PAIR,    // whether to concat mates before N filtering
2896 
2897 
2898                       GAP_PEN_CON, // constant coeff for read gap cost
2899                       GAP_PEN_CON, // constant coeff for ref gap cost
2900                       GAP_PEN_LIN, // linear coeff for read gap cost
2901                       GAP_PEN_LIN, // linear coeff for ref gap cost
2902                       1 /* gGapBarrier */    // # rows at top/bot only entered diagonally
2903                       );
2904 }
2905 
makePadString(const string & ref,const string & read,string & pad,size_t len)2906 void RB_SWAligner::makePadString(const string& ref,
2907                                  const string& read,
2908                                  string& pad,
2909                                  size_t len)
2910 {
2911     pad.resize(len);
2912 
2913     for(size_t i = 0; i < len; i++) {
2914         // shift A->C, C->G, G->T, T->A
2915         pad[i] = "CGTA"[asc2dna[ref[i]]];
2916 
2917         if(read[i] == pad[i]) {
2918             // shift
2919             pad[i] = "CGTA"[asc2dna[pad[i]]];
2920         }
2921     }
2922 
2923     int head_len = len / 2;
2924     size_t pad_start = len - head_len;
2925 
2926     for(size_t i = 0; i < head_len; i++) {
2927         if(read[i] == pad[pad_start + i]) {
2928             // shift
2929             pad[pad_start + i] = "CGTA"[asc2dna[pad[pad_start + i]]];
2930         }
2931     }
2932 }
2933 
alignStrings(const string & ref,const string & read,EList<Edit> & edits,Coord & coord)2934 int RB_SWAligner::alignStrings(const string &ref,
2935                                const string &read,
2936                                EList<Edit>& edits,
2937                                Coord& coord)
2938 {
2939     // Prepare Strings
2940 
2941     // Read -> BTDnaString
2942     // Ref -> bit-encoded string
2943 
2944     //SwAligner swa;
2945 
2946     BTDnaString btread;
2947     BTString btqual;
2948     BTString btref;
2949     BTString btref2;
2950 
2951     BTDnaString btreadrc;
2952     BTString btqualrc;
2953 
2954 
2955     string qual = "";
2956     for(size_t i = 0; i < read.length(); i++) {
2957         qual.push_back('I');
2958     }
2959 
2960 #if 0
2961     cerr << "REF : " << ref << endl;
2962     cerr << "READ: " << read << endl;
2963     cerr << "QUAL: " << qual << endl;
2964 #endif
2965 
2966     btread.install(read.c_str(), true);
2967     btreadrc = btread;
2968     btreadrc.reverseComp();
2969 
2970     btqual.install(qual.c_str());
2971     btqualrc = btqual;
2972 
2973     btref.install(ref.c_str());
2974 
2975     TAlScore min_score = sc_->scoreMin.f<TAlScore >((double)btread.length());
2976 
2977     btref2 = btref;
2978 
2979     size_t nceil = 0;
2980     // size_t nrow = btread.length();
2981 
2982     // Convert reference string to mask
2983     for(size_t i = 0; i < btref2.length(); i++) {
2984         if(toupper(btref2[i]) == 'N') {
2985             btref2.set(16, i);
2986         } else {
2987             int num = 0;
2988             int alts[] = {4, 4, 4, 4};
2989             decodeNuc(toupper(btref2[i]), num, alts);
2990             assert_leq(num, 4);
2991             assert_gt(num, 0);
2992             btref2.set(0, i);
2993             for(int j = 0; j < num; j++) {
2994                 btref2.set(btref2[i] | (1 << alts[j]), i);
2995             }
2996         }
2997     }
2998 
2999 
3000     bool fw = true;
3001     uint32_t refidx = 0;
3002 
3003     swa.initRead(
3004                  btread,     // read sequence
3005                  btreadrc,
3006                  btqual,     // read qualities
3007                  btqualrc,
3008                  0,          // offset of first character within 'read' to consider
3009                  btread.length(), // offset of last char (exclusive) in 'read' to consider
3010                  *sc_);      // local-alignment score floor
3011 
3012     DynProgFramer dpframe(false);
3013     size_t readgaps = 0;
3014     size_t refgaps = 0;
3015     size_t maxhalf = 0;
3016 
3017     DPRect rect;
3018     dpframe.frameSeedExtensionRect(
3019                                    0,              // ref offset implied by seed hit assuming no gaps
3020                                    btread.length(),    // length of read sequence used in DP table
3021                                    btref2.length(),    // length of reference
3022                                    readgaps,   // max # of read gaps permitted in opp mate alignment
3023                                    refgaps,    // max # of ref gaps permitted in opp mate alignment
3024                                    (size_t)nceil,  // # Ns permitted
3025                                    maxhalf, // max width in either direction
3026                                    rect);      // DP Rectangle
3027 
3028     assert(rect.repOk());
3029 
3030     size_t cminlen = 2000, cpow2 = 4;
3031 
3032     swa.initRef(
3033                 fw,                 // whether to align forward or revcomp read
3034                 refidx,             // reference ID
3035                 rect,               // DP rectangle
3036                 btref2.wbuf(),      // reference strings
3037                 0,                  // offset of first reference char to align to
3038                 btref2.length(),    // offset of last reference char to align to
3039                 btref2.length(),    // length of reference sequence
3040                 *sc_,               // scoring scheme
3041                 min_score,          // minimum score
3042                 true,               // use 8-bit SSE if positions
3043                 cminlen,            // minimum length for using checkpointing scheme
3044                 cpow2,              // interval b/t checkpointed diags; 1 << this
3045                 false,              // triangular mini-fills?
3046                 false               // is this a seed extension?
3047                 );
3048 
3049 
3050     TAlScore best = std::numeric_limits<TAlScore>::min();
3051     bool found = swa.align(rnd_, best);
3052 
3053 #ifdef DEBUGLOG
3054     cerr << "found: " << found << "\t" << best << "\t" << "minsc: " << min_score << endl;
3055 #endif
3056 
3057     if (found) {
3058 #ifdef DEBUGLOG
3059         //cerr << "CP " << "found: " << found << "\t" << best << "\t" << "minsc: " << min_score << endl;
3060         cerr << "REF : " << ref << endl;
3061         cerr << "READ: " << read << endl;
3062 #endif
3063 
3064         SwResult res;
3065         int max_match_len = 0;
3066         res.reset();
3067         res.alres.init_raw_edits(&rawEdits_);
3068 
3069         found = swa.nextAlignment(res, best, rnd_);
3070         if (found) {
3071             edits = res.alres.ned();
3072             //const TRefOff ref_off = res.alres.refoff();
3073             //const Coord& coord = res.alres.refcoord();
3074             coord = res.alres.refcoord();
3075             //assert_geq(genomeHit._joinedOff + coord.off(), genomeHit.refoff());
3076 
3077 #ifdef DEBUGLOG
3078             cerr << "num edits: " << edits.size() << endl;
3079             cerr << "coord: " << coord.off();
3080             cerr << ", " << coord.ref();
3081             cerr << ", " << coord.orient();
3082             cerr << ", " << coord.joinedOff();
3083             cerr << endl;
3084             Edit::print(cerr, edits); cerr << endl;
3085             Edit::printQAlign(cerr, btread, edits);
3086 #endif
3087 
3088             max_match_len = getMaxMatchLen(edits, btread.length());
3089 #ifdef DEBUGLOG
3090             cerr << "max match length: " << max_match_len << endl;
3091 #endif
3092         }
3093 #ifdef DEBUGLOG
3094         cerr << "nextAlignment: " << found << endl;
3095         cerr << "-------------------------" << endl;
3096 #endif
3097     }
3098 
3099     return 0;
3100 }
3101 
doTest(const RepeatParameter & rp,const string & refstr,const string & readstr)3102 void RB_SWAligner::doTest(const RepeatParameter& rp,
3103                           const string& refstr,
3104                           const string& readstr)
3105 {
3106     init_dyn(rp);
3107 
3108     doTestCase1(refstr,
3109                 readstr,
3110                 rp.max_edit);
3111 }
3112 
doTestCase1(const string & refstr,const string & readstr,TIndexOffU rpt_edit)3113 void RB_SWAligner::doTestCase1(const string& refstr,
3114                                const string& readstr,
3115                                TIndexOffU rpt_edit)
3116 {
3117     cerr << "doTestCase1----------------" << endl;
3118     EList<Edit> edits;
3119     Coord coord;
3120 
3121     if (refstr.length() == 0 ||
3122         readstr.length() == 0) {
3123         return;
3124     }
3125 
3126     EList<Edit> ed;
3127 
3128     string pad;
3129     makePadString(refstr, readstr, pad, 5);
3130 
3131     string ref2 = pad + refstr + pad;
3132     string read2 = pad + readstr + pad;
3133     alignStrings(refstr, readstr, edits, coord);
3134 
3135     size_t left = pad.length();
3136     size_t right = left + readstr.length();
3137 
3138     edits.reserveExact(ed.size());
3139     for(size_t i = 0; i < ed.size(); i++) {
3140         if(ed[i].pos >= left && ed[i].pos <= right) {
3141             edits.push_back(ed[i]);
3142             edits.back().pos -= left;
3143         }
3144     }
3145 
3146 
3147 #if 0
3148     RepeatGroup rg;
3149 
3150     rg.edits = edits;
3151     rg.coord = coord;
3152     rg.seq = readstr;
3153     rg.base_offset = 0;
3154 
3155     string chr_name = "rep";
3156 
3157     cerr << "REF : " << refstr << endl;
3158     cerr << "READ: " << readstr << endl;
3159     size_t snpids = 0;
3160     rg.buildSNPs(snpids);
3161     rg.writeSNPs(cerr, chr_name); cerr << endl;
3162 #endif
3163 }
3164 
3165 
3166 template<typename TStr>
RepeatBuilder(TStr & s,const EList<RefRecord> & szs,const EList<string> & ref_names,bool forward_only,const string & filename)3167 RepeatBuilder<TStr>::RepeatBuilder(TStr& s,
3168                                    const EList<RefRecord>& szs,
3169                                    const EList<string>& ref_names,
3170                                    bool forward_only,
3171                                    const string& filename) :
3172 s_(s),
3173 coordHelper_(s.length(), forward_only ? s.length() : s.length() / 2, szs, ref_names),
3174 forward_only_(forward_only),
3175 filename_(filename),
3176 forward_length_(forward_only ? s.length() : s.length() / 2)
3177 {
3178 	cerr << "RepeatBuilder: " << filename_ << endl;
3179 }
3180 
3181 template<typename TStr>
~RepeatBuilder()3182 RepeatBuilder<TStr>::~RepeatBuilder()
3183 {
3184     for(map<size_t, RB_Repeat*>::iterator it = repeat_map_.begin(); it != repeat_map_.end(); it++) {
3185         delete it->second;
3186     }
3187     repeat_map_.clear();
3188 }
3189 
3190 template<typename TStr>
readSA(const RepeatParameter & rp,BlockwiseSA<TStr> & sa)3191 void RepeatBuilder<TStr>::readSA(const RepeatParameter& rp,
3192                                  BlockwiseSA<TStr>& sa)
3193 {
3194     TIndexOffU count = 0;
3195     subSA_.init(s_.length() + 1, rp.min_repeat_len, rp.repeat_count);
3196 
3197     while(count < s_.length() + 1) {
3198         TIndexOffU saElt = sa.nextSuffix();
3199         count++;
3200 
3201         if(count && (count % 10000000 == 0)) {
3202             cerr << "SA count " << count << endl;
3203         }
3204 
3205         if(saElt == s_.length()) {
3206             assert_eq(count, s_.length() + 1);
3207             break;
3208         }
3209 
3210         subSA_.push_back(s_, coordHelper_, saElt, count == s_.length());
3211     }
3212 
3213     cerr << "subSA size is " << subSA_.size() << endl;
3214     subSA_.dump();
3215 #if 0
3216     for(size_t i = 0; i < subSA_.size(); i++) {
3217         TIndexOffU joinedOff = subSA_[i];
3218         fp << setw(10) << joinedOff << " " << getString(s_, joinedOff, rp.seed_len) << endl;
3219     }
3220 #endif
3221     cerr << "subSA mem Usage: " << subSA_.getMemUsage() << endl << endl;
3222 }
3223 
3224 template<typename TStr>
readSA(const RepeatParameter & rp,const BitPackedArray & sa)3225 void RepeatBuilder<TStr>::readSA(const RepeatParameter &rp,
3226                                  const BitPackedArray &sa)
3227 {
3228     TIndexOffU count = 0;
3229 
3230     subSA_.init(s_.length() + 1, rp.min_repeat_len, rp.repeat_count);
3231 
3232     for(size_t i = 0; i < sa.size(); i++) {
3233         TIndexOffU saElt = sa[i];
3234         count++;
3235 
3236         if(count && (count % 10000000 == 0)) {
3237             cerr << "RB count " << count << endl;
3238         }
3239 
3240         if(saElt == s_.length()) {
3241             assert_eq(count, s_.length() + 1);
3242             break;
3243         }
3244 
3245         subSA_.push_back(s_, coordHelper_, saElt, count == s_.length());
3246     }
3247 
3248     cerr << "subSA size: " << endl;
3249     subSA_.dump();
3250 #if 0
3251     for(size_t i = 0; i < subSA_.size(); i++) {
3252         TIndexOffU joinedOff = subSA_[i];
3253         fp << setw(10) << joinedOff << " " << getString(s_, joinedOff, rp.seed_len) << endl;
3254     }
3255 #endif
3256     cerr << "subSA mem Usage: " << subSA_.getMemUsage() << endl << endl;
3257 }
3258 
3259 template<typename TStr>
build(const RepeatParameter & rp)3260 void RepeatBuilder<TStr>::build(const RepeatParameter& rp)
3261 {
3262     string rpt_len_str;
3263     rpt_len_str = to_string(rp.min_repeat_len) + "-" + to_string(rp.max_repeat_len);
3264 
3265     string seed_filename = filename_ + ".rep." + rpt_len_str + ".seed";
3266     ofstream fp(seed_filename.c_str());
3267 
3268     swaligner_.init_dyn(rp);
3269 
3270     RB_RepeatManager* repeat_manager = new RB_RepeatManager;
3271 
3272     EList<RB_RepeatBase> repeatBases;
3273     subSA_.buildRepeatBase(s_,
3274                            coordHelper_,
3275                            rp.max_repeat_len,
3276                            repeatBases);
3277 
3278     size_t repeat_id = 0;
3279     for(size_t i = 0; i < repeatBases.size(); i++) {
3280         addRepeatGroup(rp,
3281                        repeat_id,
3282                        *repeat_manager,
3283                        repeatBases[i],
3284                        fp);
3285     }
3286 
3287     {
3288         // Build and test minimizer-based k-mer table
3289         const size_t window = RB_Minimizer<TStr>::default_w;
3290         const size_t k = RB_Minimizer<TStr>::default_k;
3291         RB_KmerTable kmer_table;
3292         EList<string> seqs;
3293         seqs.reserveExact(repeat_map_.size());
3294         for(map<size_t, RB_Repeat*>::const_iterator it = repeat_map_.begin(); it != repeat_map_.end(); it++) {
3295             const RB_Repeat& repeat = *(it->second);
3296             assert(repeat.satisfy(rp));
3297             seqs.expand();
3298             seqs.back() = repeat.consensus();
3299         }
3300         kmer_table.build(seqs,
3301                          window,
3302                          k);
3303         kmer_table.dump(cerr);
3304         cerr << endl;
3305 
3306         string query, rc_query;
3307         EList<pair<uint64_t, size_t> > minimizers;
3308         size_t total = 0, num_repeat = 0, correct = 0, false_positive = 0, false_negative = 0;
3309         for(size_t i = 0; i + rp.min_repeat_len <= forward_length_; i += 1000) {
3310             if(coordHelper_.getEnd(i) != coordHelper_.getEnd(i + rp.min_repeat_len))
3311                 continue;
3312             query = getString(s_, i, rp.min_repeat_len);
3313             rc_query = reverseComplement(query);
3314 
3315             TIndexOffU idx = subSA_.find_repeat_idx(s_, query);
3316             const EList<TIndexOffU>& test_repeat_index = subSA_.getRepeatIndex();
3317             bool repeat = (idx < test_repeat_index.size());
3318             bool est_repeat = kmer_table.isRepeat(query,
3319                                                   rc_query,
3320                                                   minimizers);
3321 
3322             total++;
3323             if(repeat) num_repeat++;
3324             if(repeat == est_repeat) {
3325                 correct++;
3326             } else {
3327                 if(est_repeat) {
3328                     false_positive++;
3329                 } else {
3330                     false_negative++;
3331                     assert(false);
3332                 }
3333             }
3334         }
3335 
3336         cerr << "total: " << total << endl;
3337         cerr << "repeat: " << num_repeat << endl;
3338         cerr << "correct: " << correct << endl;
3339         cerr << "false positive: " << false_positive << endl;
3340         cerr << "false negative: " << false_negative << endl;
3341         cerr << endl;
3342 
3343         ELList<RB_Alignment> position2D; EList<RB_Alignment> alignments;
3344         size_t repeat_total = 0, repeat_aligned = 0;
3345         const EList<TIndexOffU>& test_repeat_index = subSA_.getRepeatIndex();
3346         size_t interval = 1;
3347         if(test_repeat_index.size() >= 1000) {
3348             interval = test_repeat_index.size() / 1000;
3349         }
3350         size_t total_alignments = 0, max_alignments = 0;
3351         string query2, rc_query2;
3352         for(size_t i = 0; i < test_repeat_index.size(); i += interval) {
3353             TIndexOffU saElt_idx = test_repeat_index[i];
3354             TIndexOffU saElt_idx_end = (i + 1 < test_repeat_index.size() ? test_repeat_index[i+1] : subSA_.size());
3355             TIndexOffU saElt_size = saElt_idx_end - saElt_idx;
3356             TIndexOffU saElt = subSA_[saElt_idx];
3357             query = getString(s_, saElt, rp.min_repeat_len);
3358             query2 = query;
3359 
3360             // introduce three mismatches into a query when the query is at lest 100-bp
3361             if(rp.min_repeat_len >= 100) {
3362                 const size_t mid_pos1 = (size_t)(rp.min_repeat_len * 0.1);
3363                 if(query2[mid_pos1] == 'A') {
3364                     query2[mid_pos1] = 'C';
3365                 } else {
3366                     query2[mid_pos1] = 'A';
3367                 }
3368                 const size_t mid_pos2 = (size_t)(rp.min_repeat_len * 0.5);
3369                 if(query2[mid_pos2] == 'C') {
3370                     query2[mid_pos2] = 'G';
3371                 } else {
3372                     query2[mid_pos2] = 'C';
3373                 }
3374                 const size_t mid_pos3 = (size_t)(rp.min_repeat_len * 0.9);
3375                 if(query2[mid_pos3] == 'G') {
3376                     query2[mid_pos3] = 'T';
3377                 } else {
3378                     query2[mid_pos3] = 'G';
3379                 }
3380             }
3381 
3382             repeat_total += saElt_size;
3383 
3384             size_t found = 0;
3385             size_t cur_alignments = 0;
3386             if(kmer_table.isRepeat(query2, minimizers)) {
3387                 kmer_table.findAlignments(query2,
3388                                           minimizers,
3389                                           position2D,
3390                                           alignments);
3391                 total_alignments += (alignments.size() * saElt_size);
3392                 cur_alignments = alignments.size();
3393                 TIndexOffU baseoff = 0;
3394                 for(size_t s = 0; s < seqs.size(); s++) {
3395                     int spos = seqs[s].find(query);
3396                     if(spos != string::npos) {
3397                         for(size_t a = 0; a < alignments.size(); a++) {
3398                             if(alignments[a].pos == baseoff + spos) {
3399                                 found++;
3400                             }
3401                         }
3402                     }
3403                     baseoff += seqs[s].length();
3404                 }
3405 
3406                 assert_leq(found, 1);
3407             }
3408 
3409             rc_query = reverseComplement(query);
3410             rc_query2 = reverseComplement(query2);
3411             size_t rc_found = 0;
3412             if(kmer_table.isRepeat(rc_query2, minimizers)) {
3413                 kmer_table.findAlignments(rc_query2,
3414                                           minimizers,
3415                                           position2D,
3416                                           alignments);
3417                 total_alignments += (alignments.size() * saElt_size);
3418                 cur_alignments += alignments.size();
3419                 if(cur_alignments > max_alignments) {
3420                     max_alignments = cur_alignments;
3421                 }
3422 
3423                 TIndexOffU baseoff = 0;
3424                 for(size_t s = 0; s < seqs.size(); s++) {
3425                     int spos = seqs[s].find(rc_query);
3426                     if(spos != string::npos) {
3427                         for(size_t a = 0; a < alignments.size(); a++) {
3428                             if(alignments[a].pos == baseoff + spos) {
3429                                 rc_found++;
3430                             }
3431                         }
3432                     }
3433                     baseoff += seqs[s].length();
3434                 }
3435                 assert_leq(rc_found, 1);
3436             }
3437 
3438             if(found + rc_found == 1) {
3439                 repeat_aligned += saElt_size;
3440             }
3441         }
3442 
3443         cerr << "num repeats: " << repeat_total << endl;
3444         cerr << "repeat aligned using minimizers: " << repeat_aligned << endl;
3445         cerr << "average number of alignments: " << (float)total_alignments / repeat_total << endl;
3446         cerr << "max alignment: " << max_alignments << endl;
3447         cerr << endl;
3448     }
3449 
3450     cerr << "number of repeats is " << repeat_map_.size() << endl;
3451 
3452     size_t total_rep_seq_len = 0, total_allele_seq_len = 0;
3453     size_t i = 0;
3454     for(map<size_t, RB_Repeat*>::iterator it = repeat_map_.begin(); it != repeat_map_.end(); it++, i++) {
3455         RB_Repeat& repeat = *(it->second);
3456         if(!repeat.satisfy(rp))
3457             continue;
3458 
3459         repeat.saveSeedExtension(rp,
3460                                  s_,
3461                                  coordHelper_,
3462                                  i,
3463                                  fp,
3464                                  total_rep_seq_len,
3465                                  total_allele_seq_len);
3466     }
3467 
3468     size_t total_qual_seeds = 0;
3469     for(map<size_t, RB_Repeat*>::iterator it = repeat_map_.begin(); it != repeat_map_.end(); it++, i++) {
3470         RB_Repeat& repeat = *(it->second);
3471         EList<SeedExt>& seeds = repeat.seeds();
3472         for(size_t i = 0; i < seeds.size(); i++) {
3473             if(seeds[i].getLength() < rp.min_repeat_len)
3474                 continue;
3475             total_qual_seeds++;
3476         }
3477     }
3478 
3479     cerr << "total repeat sequence length: " << total_rep_seq_len << endl;
3480     cerr << "total allele sequence length: " << total_allele_seq_len << endl;
3481     cerr << "total number of seeds including those that are position-wise different, but sequence-wise identical: " << total_qual_seeds << endl;
3482     cerr << endl;
3483     fp << "total repeat sequence length: " << total_rep_seq_len << endl;
3484     fp << "total allele sequence length: " << total_allele_seq_len << endl;
3485     fp << "total number of seeds including those that are position-wise different, but sequence-wise identical: " << total_qual_seeds << endl;
3486     fp.close();
3487 
3488     delete repeat_manager;
3489     repeat_manager = NULL;
3490 
3491     // remove non-qualifying repeats
3492     //  and update repeat IDs for those remaining
3493     {
3494         map<size_t, RB_Repeat*> temp_repeat_map;
3495         size_t i = 0;
3496         for(map<size_t, RB_Repeat*>::iterator it = repeat_map_.begin(); it != repeat_map_.end(); it++) {
3497             RB_Repeat* repeat = it->second;
3498             if(!repeat->satisfy(rp)) {
3499                 delete repeat;
3500                 continue;
3501             }
3502             repeat->repeat_id(i);
3503             temp_repeat_map[repeat->repeat_id()] = repeat;
3504             i++;
3505         }
3506         repeat_map_ = temp_repeat_map;
3507     }
3508 
3509     const bool sanity_check = true;
3510     if(sanity_check) {
3511         EList<pair<size_t, size_t> > kmer_table;
3512         string query;
3513         size_t total = 0, match = 0;
3514         EList<TIndexOffU> positions;
3515         const EList<TIndexOffU>& test_repeat_index = subSA_.getRepeatIndex();
3516         size_t interval = 1;
3517         if(test_repeat_index.size() >= 10000) {
3518             interval = test_repeat_index.size() / 10000;
3519         }
3520         for(size_t i = 0; i < test_repeat_index.size(); i += interval) {
3521             TIndexOffU saElt_idx = test_repeat_index[i];
3522             TIndexOffU saElt_idx_end = (i + 1 < test_repeat_index.size() ? test_repeat_index[i+1] : subSA_.size());
3523             positions.clear();
3524             for(size_t j = saElt_idx; j < saElt_idx_end; j++) {
3525                 positions.push_back(subSA_[j]);
3526 #ifndef NDEBUG
3527                 if(j > saElt_idx) {
3528                     TIndexOffU lcp_len = getLCP(s_,
3529                                                 coordHelper_,
3530                                                 positions[0],
3531                                                 positions.back(),
3532                                                 rp.min_repeat_len);
3533                     assert_eq(lcp_len, rp.min_repeat_len);
3534                 }
3535 
3536                 TIndexOffU saElt = subSA_[j];
3537                 TIndexOffU start = coordHelper_.getStart(saElt);
3538                 TIndexOffU start2 = coordHelper_.getStart(saElt + rp.min_repeat_len - 1);
3539                 assert_eq(start, start2);
3540 #endif
3541             }
3542 
3543             TIndexOffU saElt = subSA_[saElt_idx];
3544             size_t true_count = saElt_idx_end - saElt_idx;
3545             getString(s_, saElt, rp.min_repeat_len, query);
3546             total++;
3547 
3548             size_t count = 0, rc_count = 0;
3549             string rc_query = reverse_complement(query);
3550             for(map<size_t, RB_Repeat*>::iterator it = repeat_map_.begin(); it != repeat_map_.end(); it++) {
3551                 RB_Repeat& repeat = *(it->second);
3552                 int pos = repeat.consensus().find(query);
3553                 if(pos != string::npos) {
3554                     for(size_t s = 0; s < repeat.seeds().size(); s++) {
3555                         SeedExt& seed = repeat.seeds()[s];
3556                         string seq;
3557                         seed.getExtendedSeedSequence(s_, seq);
3558                         if(seq.find(query) != string::npos)
3559                             count++;
3560                     }
3561                 }
3562                 pos = repeat.consensus().find(rc_query);
3563                 if(pos != string::npos) {
3564                     for(size_t s = 0; s < repeat.seeds().size(); s++) {
3565                         SeedExt& seed = repeat.seeds()[s];
3566                         string seq;
3567                         seed.getExtendedSeedSequence(s_, seq);
3568                         if(seq.find(rc_query) != string::npos)
3569                             rc_count++;
3570                     }
3571                 }
3572             }
3573 
3574             if(count == true_count || rc_count == true_count) {
3575                 match++;
3576             } else if(total - match <= 10) {
3577                 cerr << "   query: " << query << endl;
3578                 cerr << "rc_query: " << rc_query << endl;
3579                 cerr << "true count: " << true_count << endl;
3580                 cerr << "found count: " << count << endl;
3581                 cerr << "rc found count: " << rc_count << endl;
3582                 cerr << endl;
3583             }
3584         }
3585 
3586         cerr << "RepeatBuilder: sanity check: " << match << " passed (out of " << total << ")" << endl << endl;
3587     }
3588 }
3589 
3590 template<typename TStr>
writeHaploType(const EList<SeedHP> & haplo_lists,const EList<SeedExt> & seeds,TIndexOffU & hapl_id_base,const string & seq_name,ostream & fp)3591 void RepeatBuilder<TStr>::writeHaploType(const EList<SeedHP>& haplo_lists,
3592                                          const EList<SeedExt>& seeds,
3593                                          TIndexOffU& hapl_id_base,
3594                                          const string& seq_name,
3595                                          ostream &fp)
3596 {
3597     if(haplo_lists.size() == 0) {
3598         return;
3599     }
3600 
3601     for(size_t i = 0; i < haplo_lists.size(); i++) {
3602         const SeedHP &haploType = haplo_lists[i];
3603 
3604         fp << "rpht" << hapl_id_base++;
3605         fp << "\t" << seq_name;
3606         fp << "\t" << haploType.range.first;
3607         fp << "\t" << haploType.range.second;
3608         fp << "\t";
3609 
3610         assert_gt(haploType.snpIDs.size(), 0);
3611 
3612         for (size_t j = 0; j < haploType.snpIDs.size(); j++) {
3613             if(j) {
3614                 fp << ",";
3615             }
3616             fp << haploType.snpIDs[j];
3617         }
3618         fp << endl;
3619     }
3620 }
3621 
3622 template<typename TStr>
writeAllele(TIndexOffU grp_id,TIndexOffU allele_id,Range range,const string & seq_name,TIndexOffU baseoff,const EList<SeedExt> & seeds,ostream & fp)3623 void RepeatBuilder<TStr>::writeAllele(TIndexOffU grp_id,
3624                                       TIndexOffU allele_id,
3625                                       Range range,
3626                                       const string& seq_name,
3627                                       TIndexOffU baseoff,
3628                                       const EList<SeedExt>& seeds,
3629                                       ostream &fp)
3630 {
3631     // >rpt_name*0\trep\trep_pos\trep_len\tpos_count\t0
3632     // chr_name:pos:direction chr_name:pos:direction
3633     //
3634     // >rep1*0	rep	0	100	470	0
3635     // 22:112123123:+ 22:1232131113:+
3636     //
3637     size_t snp_size = seeds[range.first].snps.size();
3638     size_t pos_size = range.second - range.first;
3639     fp << ">";
3640     fp << "rpt_" << grp_id << "*" << allele_id;
3641     fp << "\t" << seq_name;
3642     fp << "\t" << baseoff + seeds[range.first].consensus_pos.first;
3643     fp << "\t" << seeds[range.first].consensus_pos.second - seeds[range.first].consensus_pos.first;
3644     fp << "\t" << pos_size;
3645     fp << "\t" << snp_size;
3646 
3647     fp << "\t";
3648     for(size_t i = 0; i < snp_size; i++) {
3649         if(i > 0) {fp << ",";}
3650         fp << "rps" << seeds[range.first].snps[i]->id;
3651     }
3652     fp << endl;
3653 
3654     // print positions
3655     for(size_t i = 0; i < pos_size; i++) {
3656         if(i > 0 && (i % 10 == 0)) {
3657             fp << endl;
3658         }
3659 
3660         if(i % 10 != 0) {
3661             fp << " ";
3662         }
3663         string chr_name;
3664         TIndexOffU pos_in_chr;
3665         TIndexOffU joinedOff = seeds[range.first + i].pos.first;
3666 
3667         bool fw = true;
3668         if(joinedOff < forward_length_) {
3669             fw = true;
3670         } else {
3671             fw = false;
3672             joinedOff = s_.length() - joinedOff - seeds[range.first].getLength();
3673         }
3674 
3675         coordHelper_.getGenomeCoord(joinedOff, chr_name, pos_in_chr);
3676 
3677         char direction = fw ? '+' : '-';
3678         fp << chr_name << ":" << pos_in_chr << ":" << direction;
3679     }
3680 
3681     fp << endl;
3682 }
3683 
3684 template<typename TStr>
writeSNPs(ostream & fp,const string & rep_chr_name,const ESet<Edit> & editset)3685 void RepeatBuilder<TStr>::writeSNPs(ostream& fp,
3686                                     const string& rep_chr_name,
3687                                     const ESet<Edit>& editset)
3688 {
3689     for(size_t i = 0; i < editset.size(); i++) {
3690         const Edit& ed = editset[i];
3691 
3692         if(ed.isMismatch()) {
3693             fp << "rps" << ed.snpID;
3694             fp << "\t" << "single";
3695             fp << "\t" << rep_chr_name;
3696             fp << "\t" << ed.pos;
3697             fp << "\t" << ed.qchr;
3698             fp << endl;
3699         } else {
3700             assert(false);
3701         }
3702     }
3703 }
3704 
3705 
3706 template<typename TStr>
saveFile(const RepeatParameter & rp)3707 void RepeatBuilder<TStr>::saveFile(const RepeatParameter& rp)
3708 {
3709     saveRepeats(rp);
3710 }
3711 
checkRedundant(const RepeatParameter & rp,const map<size_t,RB_Repeat * > & repeat_map,const EList<TIndexOffU> & positions,EList<size_t> & to_remove) const3712 bool RB_RepeatManager::checkRedundant(const RepeatParameter& rp,
3713                                       const map<size_t, RB_Repeat*>& repeat_map,
3714                                       const EList<TIndexOffU>& positions,
3715                                       EList<size_t>& to_remove) const
3716 {
3717     to_remove.clear();
3718     bool replace = false;
3719     for(size_t i = 0; i < positions.size(); i++) {
3720         TIndexOffU seed_pos = positions[i];
3721         Range seed_range(seed_pos + rp.seed_len, 0);
3722         map<Range, EList<size_t> >::const_iterator it = range_to_repeats_.upper_bound(seed_range);
3723         if(it == range_to_repeats_.end())
3724             continue;
3725 
3726         for(map<Range, EList<size_t> >::const_reverse_iterator rit(it); rit != range_to_repeats_.rend(); rit++) {
3727             Range repeat_range = rit->first;
3728             if(repeat_range.first + rp.max_repeat_len <= seed_pos)
3729                 break;
3730 
3731             const EList<size_t>& repeat_ids = rit->second;
3732             assert_gt(repeat_ids.size(), 0);
3733             for(size_t r = 0; r < repeat_ids.size(); r++) {
3734                 size_t repeat_id = repeat_ids[r];
3735                 size_t idx = to_remove.bsearchLoBound(repeat_id);
3736                 if(idx < to_remove.size() && to_remove[idx] == repeat_id)
3737                     continue;
3738 
3739                 bool overlap = seed_pos < repeat_range.second && seed_pos + rp.seed_len > repeat_range.first;
3740                 if(!overlap)
3741                     continue;
3742 
3743                 map<size_t, RB_Repeat*>::const_iterator it2 = repeat_map.find(repeat_id);
3744                 assert(it2 != repeat_map.end());
3745                 const RB_Repeat& repeat = *(it2->second);
3746                 const EList<RB_AlleleCoord>& allele_ranges = repeat.seed_ranges();
3747                 size_t num_contain = 0, num_overlap = 0, num_close = 0, num_overlap_bp = 0;
3748                 size_t p = 0, p2 = 0;
3749                 while(p < positions.size() && p2 < allele_ranges.size()) {
3750                     RB_AlleleCoord range;
3751                     range.left = positions[p];
3752                     range.right = positions[p] + rp.seed_len;
3753                     RB_AlleleCoord range2 = allele_ranges[p2];
3754                     if(range2.contain(range)) {
3755                         num_contain++;
3756                         num_overlap_bp += rp.seed_len;
3757                     } else {
3758                         TIndexOffU overlap = range2.overlap_len(range);
3759                         if(overlap > 0) {
3760                             num_overlap++;
3761                             num_overlap_bp += (range2.right - range.left);
3762                         } else if(range.right + 10 > range2.left && range2.right + 10 > range.left) {
3763                             num_close++;
3764                         }
3765                     }
3766                     if(range.right <= range2.right) p++;
3767                     else                            p2++;
3768                 }
3769 
3770                 // if the number of matches is >= 90% of positions in the smaller group
3771                 if((num_contain + num_overlap) * 10 + num_close * 8 >= min(positions.size(), allele_ranges.size()) * 9) {
3772                     if(positions.size() <= allele_ranges.size()) {
3773                         return true;
3774                     } else {
3775                         replace = true;
3776                         to_remove.push_back(repeat_id);
3777                         to_remove.sort();
3778                     }
3779                 }
3780             }
3781         }
3782 
3783         // DK - check this out
3784         if(replace)
3785             break;
3786     }
3787     return false;
3788 }
3789 
addRepeat(const RB_Repeat * repeat)3790 void RB_RepeatManager::addRepeat(const RB_Repeat* repeat)
3791 {
3792     const EList<RB_AlleleCoord>& allele_ranges = repeat->seed_ranges();
3793     for(size_t i = 0; i < allele_ranges.size(); i++) {
3794         Range allele_range(allele_ranges[i].left, allele_ranges[i].right);
3795         addRepeat(allele_range, repeat->repeat_id());
3796     }
3797 }
3798 
addRepeat(Range range,size_t repeat_id)3799 void RB_RepeatManager::addRepeat(Range range, size_t repeat_id)
3800 {
3801     if(range_to_repeats_.find(range) == range_to_repeats_.end()) {
3802         range_to_repeats_[range] = EList<size_t>();
3803     }
3804     EList<size_t>& repeat_ids = range_to_repeats_[range];
3805     size_t idx = repeat_ids.bsearchLoBound(repeat_id);
3806     if(idx < repeat_ids.size() && repeat_ids[idx] == repeat_id)
3807         return;
3808     repeat_ids.push_back(repeat_id);
3809     repeat_ids.sort();
3810 }
3811 
removeRepeat(const RB_Repeat * repeat)3812 void RB_RepeatManager::removeRepeat(const RB_Repeat* repeat)
3813 {
3814     const EList<RB_AlleleCoord>& allele_ranges = repeat->seed_ranges();
3815     for(size_t p = 0; p < allele_ranges.size(); p++) {
3816         Range range(allele_ranges[p].left, allele_ranges[p].right);
3817         removeRepeat(range, repeat->repeat_id());
3818     }
3819 }
3820 
removeRepeat(Range range,size_t repeat_id)3821 void RB_RepeatManager::removeRepeat(Range range, size_t repeat_id)
3822 {
3823     EList<size_t>& repeat_ids = range_to_repeats_[range];
3824     TIndexOffU idx = repeat_ids.bsearchLoBound(repeat_id);
3825     if(idx < repeat_ids.size() && repeat_id == repeat_ids[idx]) {
3826         repeat_ids.erase(idx);
3827         if(repeat_ids.empty())
3828             range_to_repeats_.erase(range);
3829     }
3830 }
3831 
showInfo(const RepeatParameter & rp,CoordHelper & coordHelper,const map<size_t,RB_Repeat * > & repeat_map,size_t num_case) const3832 void RB_RepeatManager::showInfo(const RepeatParameter& rp,
3833                                 CoordHelper& coordHelper,
3834                                 const map<size_t, RB_Repeat*>& repeat_map,
3835                                 size_t num_case) const
3836 {
3837     size_t count = 0;
3838     for(map<Range, EList<size_t> >::const_iterator it = range_to_repeats_.begin();
3839         it != range_to_repeats_.end(); it++) {
3840         const Range& range = it->first;
3841         if(range.second - range.first < rp.min_repeat_len) continue;
3842         map<Range, EList<size_t> >::const_iterator jt = it; jt++;
3843         for(; jt != range_to_repeats_.end(); jt++) {
3844             const Range& range2 = jt->first;
3845             if(range2.second - range2.first < rp.min_repeat_len) continue;
3846             if(range.second <= range2.first)
3847                 break;
3848             if(count < num_case) {
3849                 cerr << "range (" << range.first << ", " << range.second << ") vs. range2 (";
3850                 cerr << range2.first << ", " << range2.second << ")" << endl;
3851                 for(size_t i = 0; i < it->second.size(); i++) {
3852                     cerr << "\t1 " << it->second[i] << endl;
3853                     const RB_Repeat* repeat = repeat_map.find(it->second[i])->second;
3854                     repeat->showInfo(rp, coordHelper);
3855                 }
3856                 for(size_t i = 0; i < jt->second.size(); i++) {
3857                     cerr << "\t2 " << jt->second[i] << endl;
3858                     const RB_Repeat* repeat = repeat_map.find(jt->second[i])->second;
3859                     repeat->showInfo(rp, coordHelper);
3860                 }
3861                 cerr << endl << endl;
3862             }
3863             count++;
3864         }
3865     }
3866     cerr << "ShowInfo - count: " << count << endl;
3867 }
3868 
3869 template<typename TStr>
reassignSeeds(const RepeatParameter & rp,size_t repeat_bid,size_t repeat_eid,EList<SeedExt> & seeds)3870 void RepeatBuilder<TStr>::reassignSeeds(const RepeatParameter& rp,
3871                                         size_t repeat_bid, // repeat begin id
3872                                         size_t repeat_eid, // repeat end id
3873                                         EList<SeedExt>& seeds)
3874 {
3875     assert_lt(repeat_bid, repeat_eid);
3876     EList<bool> updated;
3877     updated.resizeExact(repeat_eid - repeat_bid);
3878     updated.fillZero();
3879 
3880     string seq;
3881     for(size_t s = 0; s < seeds.size(); s++) {
3882         SeedExt& seed = seeds[s];
3883         size_t max_repeat_id = repeat_bid;
3884         size_t max_ext_len = 0;
3885         for(size_t i = repeat_bid; i < repeat_eid; i++) {
3886             map<size_t, RB_Repeat*>::iterator it = repeat_map_.find(i);
3887             assert(it != repeat_map_.end());
3888             const RB_Repeat& repeat = *(it->second);
3889             const string& consensus = repeat.consensus();
3890             const size_t ext_len = (consensus.length() - rp.seed_len) / 2;
3891 
3892             if(seed.bound.first + ext_len > seed.pos.first)
3893                 continue;
3894             if(seed.pos.second + ext_len > seed.bound.second)
3895                 continue;
3896 
3897             getString(s_, seed.pos.first - ext_len, rp.seed_len + ext_len * 2, seq);
3898             assert_eq(consensus.length(), seq.length());
3899             size_t tmp_ext_len = 0;
3900             for(size_t j = 0; j < ext_len; j++, tmp_ext_len++) {
3901                 if(seq[ext_len - j - 1] != consensus[ext_len - j - 1] ||
3902                    seq[ext_len + rp.seed_len + j] != consensus[ext_len + rp.seed_len + j]) {
3903                     break;
3904                 }
3905             }
3906 
3907             if(tmp_ext_len > max_ext_len) {
3908                 max_repeat_id = i;
3909                 max_ext_len = tmp_ext_len;
3910             }
3911         }
3912         if(rp.seed_len + max_ext_len * 2 >= rp.min_repeat_len) {
3913             seed.pos.first -= max_ext_len;
3914             seed.pos.second += max_ext_len;
3915             map<size_t, RB_Repeat*>::iterator it = repeat_map_.find(max_repeat_id);
3916             assert(it != repeat_map_.end());
3917             RB_Repeat& repeat = *(it->second);
3918             const string& consensus = repeat.consensus();
3919             const size_t ext_len = (consensus.length() - rp.seed_len) / 2;
3920             assert_leq(max_ext_len, ext_len);
3921             seed.consensus_pos.first = ext_len - max_ext_len;
3922             seed.consensus_pos.second = consensus.length() - (ext_len - max_ext_len);
3923             assert_leq(seed.consensus_pos.second - seed.consensus_pos.first, consensus.length());
3924             repeat.addSeed(seed);
3925             updated[max_repeat_id - repeat_bid] = true;
3926         }
3927     }
3928 
3929     for(size_t i = repeat_bid; i < repeat_eid; i++) {
3930         if(!updated[i - repeat_bid])
3931             continue;
3932         map<size_t, RB_Repeat*>::iterator it = repeat_map_.find(i);
3933         assert(it != repeat_map_.end());
3934         RB_Repeat& repeat = *(it->second);
3935         repeat.update();
3936     }
3937 }
3938 
3939 /**
3940  * TODO
3941  * @brief
3942  *
3943  * @param rpt_seq
3944  * @param rpt_range
3945  */
3946 template<typename TStr>
addRepeatGroup(const RepeatParameter & rp,size_t & repeat_id,RB_RepeatManager & repeat_manager,const RB_RepeatBase & repeatBase,ostream & fp)3947 void RepeatBuilder<TStr>::addRepeatGroup(const RepeatParameter& rp,
3948                                          size_t& repeat_id,
3949                                          RB_RepeatManager& repeat_manager,
3950                                          const RB_RepeatBase& repeatBase,
3951                                          ostream& fp)
3952 {
3953     RB_Repeat* repeat = new RB_Repeat;
3954     repeat->repeat_id(repeat_id);
3955     repeat->init(rp,
3956                  s_,
3957                  coordHelper_,
3958                  subSA_,
3959                  repeatBase);
3960 
3961     assert(repeat_map_.find(repeat->repeat_id()) == repeat_map_.end());
3962     repeat_map_[repeat->repeat_id()] = repeat;
3963     repeat_id++;
3964 }
3965 
3966 template<typename TStr>
checkSequenceMergeable(const string & ref,const string & read,EList<Edit> & edits,Coord & coord,TIndexOffU rpt_len,TIndexOffU max_edit)3967 bool RepeatBuilder<TStr>::checkSequenceMergeable(const string& ref,
3968                                                  const string& read,
3969                                                  EList<Edit>& edits,
3970                                                  Coord& coord,
3971                                                  TIndexOffU rpt_len,
3972                                                  TIndexOffU max_edit)
3973 {
3974     size_t max_matchlen = 0;
3975     EList<Edit> ed;
3976 
3977     string pad;
3978     swaligner_.makePadString(ref, read, pad, 5);
3979 
3980     string ref2 = pad + ref;
3981     string read2 = pad + read;
3982 
3983     swaligner_.alignStrings(ref2, read2, ed, coord);
3984 
3985     // match should start from pad string
3986     if(coord.off() != 0) {
3987         return false;
3988     }
3989 
3990     // no edits on pad string
3991     if(ed.size() > 0 && ed[0].pos < pad.length()) {
3992         return false;
3993     }
3994 
3995     size_t left = pad.length();
3996     size_t right = left + read.length();
3997 
3998     edits.clear();
3999     edits.reserveExact(ed.size());
4000     for(size_t i = 0; i < ed.size(); i++) {
4001         if(ed[i].pos >= left && ed[i].pos <= right) {
4002             edits.push_back(ed[i]);
4003             edits.back().pos -= left;
4004         }
4005     }
4006 
4007     max_matchlen = getMaxMatchLen(edits, read.length());
4008 
4009 #ifdef DEBUGLOG
4010     {
4011         cerr << "After pad removed" << endl;
4012         BTDnaString btread;
4013         btread.install(read.c_str(), true);
4014         Edit::print(cerr, edits); cerr << endl;
4015         Edit::printQAlign(cerr, btread, edits);
4016     }
4017 #endif
4018 
4019     return (max_matchlen >= rpt_len);
4020 }
4021 
4022 
4023 template<typename TStr>
saveRepeats(const RepeatParameter & rp)4024 void RepeatBuilder<TStr>::saveRepeats(const RepeatParameter &rp)
4025 {
4026     ios_base::openmode mode = ios_base::out;
4027     if(rp.append_result) {
4028         mode |= ios_base::app;
4029     } else {
4030         mode |= ios_base::trunc;
4031     }
4032     // Generate SNPs
4033     size_t i = 0;
4034     for(map<size_t, RB_Repeat*>::iterator it = repeat_map_.begin(); it != repeat_map_.end(); it++, i++) {
4035         RB_Repeat& repeat = *(it->second);
4036         if(!repeat.satisfy(rp))
4037             continue;
4038 
4039         // for each repeats
4040         repeat.generateSNPs(rp, s_, i);
4041     }
4042 
4043     // save snp, consensus sequenuce, info
4044     string snp_fname = filename_ + ".rep.snp";
4045     string info_fname = filename_ + ".rep.info";
4046     string hapl_fname = filename_ + ".rep.haplotype";
4047 
4048     ofstream snp_fp(snp_fname.c_str(), mode);
4049     ofstream info_fp(info_fname.c_str(), mode);
4050     ofstream hapl_fp(hapl_fname.c_str(), mode);
4051 
4052     const string repName = "rep" + to_string(rp.min_repeat_len) + "-" + to_string(rp.max_repeat_len);
4053 
4054     i = 0;
4055     TIndexOffU consensus_baseoff = 0;
4056     TIndexOffU snp_id_base = 0;
4057     TIndexOffU hapl_id_base = 0;
4058     for(map<size_t, RB_Repeat*>::iterator it = repeat_map_.begin(); it != repeat_map_.end(); it++, i++) {
4059         RB_Repeat& repeat = *(it->second);
4060         if(!repeat.satisfy(rp))
4061             continue;
4062 
4063         // for each repeats
4064         repeat.saveSNPs(snp_fp, i, snp_id_base);
4065         saveAlleles(rp,
4066                     repName,
4067                     repeat,
4068                     info_fp,
4069                     hapl_fp,
4070                     i,
4071                     hapl_id_base,
4072                     consensus_baseoff);
4073 
4074         consensus_baseoff += repeat.consensus().length();
4075     }
4076 
4077     snp_fp.close();
4078     info_fp.close();
4079     hapl_fp.close();
4080 
4081     // save all consensus sequence
4082     saveConsensus(rp, repName);
4083 }
4084 
4085 template<typename TStr>
saveConsensus(const RepeatParameter & rp,const string & repName)4086 void RepeatBuilder<TStr>::saveConsensus(const RepeatParameter &rp,
4087                                         const string& repName) {
4088     ios_base::openmode mode = ios_base::out;
4089     if(rp.append_result) {
4090         mode |= ios_base::app;
4091     } else {
4092         mode |= ios_base::trunc;
4093     }
4094 
4095     string fa_fname = filename_ + ".rep.fa";
4096     ofstream fa_fp(fa_fname.c_str(), mode);
4097 
4098     fa_fp << ">" << repName << endl;
4099 
4100     size_t oskip = 0;
4101     for(map<size_t, RB_Repeat*>::iterator it = repeat_map_.begin(); it != repeat_map_.end(); it++) {
4102         RB_Repeat& repeat = *(it->second);
4103         if(!repeat.satisfy(rp))
4104             continue;
4105 
4106         // for each repeats
4107         const string& constr = repeat.consensus();
4108         size_t si = 0;
4109         size_t constr_len = constr.length();
4110         while(si < constr_len) {
4111             size_t out_len = std::min((size_t)(output_width - oskip), (size_t)(constr_len - si));
4112             fa_fp << constr.substr(si, out_len);
4113 
4114             if((oskip + out_len) == output_width) {
4115                 fa_fp << endl;
4116                 oskip = 0;
4117             } else {
4118                 // last line
4119                 oskip = oskip + out_len;
4120             }
4121 
4122             si += out_len;
4123         }
4124     }
4125     if(oskip) {
4126         fa_fp << endl;
4127     }
4128 
4129     fa_fp.close();
4130 }
4131 
4132 template<typename TStr>
saveAlleles(const RepeatParameter & rp,const string & repName,RB_Repeat & repeat,ofstream & fp,ofstream & hapl_fp,TIndexOffU grp_id,TIndexOffU & hapl_id_base,TIndexOffU baseoff)4133 void RepeatBuilder<TStr>::saveAlleles(
4134         const RepeatParameter& rp,
4135         const string& repName,
4136         RB_Repeat& repeat,
4137         ofstream& fp,
4138         ofstream& hapl_fp,
4139         TIndexOffU grp_id,
4140         TIndexOffU& hapl_id_base,
4141         TIndexOffU baseoff)
4142 {
4143     const EList<SeedExt>& seeds = repeat.seeds();
4144     EList<SeedHP> haplo_lists;
4145     Range range(0, seeds.size());
4146 
4147     int allele_id = 0;
4148 
4149     for(size_t sb = range.first; sb < range.second;) {
4150         size_t se = sb + 1;
4151         for(; se < range.second; se++) {
4152             if(!(SeedExt::isSameConsensus(seeds[sb], seeds[se])
4153                  && SeedExt::isSameSNPs(seeds[sb], seeds[se])
4154                  && seeds[sb].aligned == seeds[se].aligned)) {
4155                 break;
4156             }
4157         }
4158 
4159         if(!seeds[sb].aligned) {
4160             sb = se;
4161             continue;
4162         }
4163 
4164         if(seeds[sb].getLength() < rp.min_repeat_len) {
4165             sb = se;
4166             continue;
4167         }
4168 
4169         // [sb, se) are same alleles
4170         writeAllele(grp_id, allele_id, Range(sb, se),
4171                     repName, baseoff, seeds, fp);
4172         generateHaploType(Range(sb, se), seeds, haplo_lists);
4173 
4174         allele_id++;
4175         sb = se;
4176     }
4177 
4178     // sort HaploType List by pos
4179     haplo_lists.sort();
4180     writeHaploType(haplo_lists, seeds, hapl_id_base, repName, hapl_fp);
4181 }
4182 
4183 template<typename TStr>
generateHaploType(Range range,const EList<SeedExt> & seeds,EList<SeedHP> & haplo_list)4184 void RepeatBuilder<TStr>::generateHaploType(Range range, const EList<SeedExt> &seeds, EList<SeedHP> &haplo_list)
4185 {
4186     const EList<SeedSNP *>& snps = seeds[range.first].snps;
4187     if(snps.size() == 0)
4188         return;
4189 
4190     // right-most position
4191     TIndexOffU max_right_pos = seeds[range.first].consensus_pos.second - 1;
4192 
4193 #ifndef NDEBUG
4194     for(size_t i = 0; i < snps.size(); i++) {
4195         assert_leq(snps[i]->pos, max_right_pos);
4196     }
4197 #endif
4198 
4199     // create haplotypes of at least 16 bp long to prevent combinations of SNPs
4200     // break a list of SNPs into several haplotypes if a SNP is far from the next SNP in the list
4201     const TIndexOffU min_ht_len = 16;
4202     size_t eb = 0, ee = 1;
4203     while(ee < snps.size() + 1) {
4204         if(ee == snps.size() ||
4205            snps[eb]->pos + (min_ht_len << 1) < snps[ee]->pos) {
4206             TIndexOffU left_pos = snps[eb]->pos;
4207             TIndexOffU right_pos = snps[ee-1]->pos;
4208 
4209             if(snps[ee-1]->type == EDIT_TYPE_READ_GAP) {
4210                 right_pos += snps[ee-1]->len;
4211             }
4212             if(left_pos + min_ht_len - 1 > right_pos) {
4213                 right_pos = left_pos + min_ht_len - 1;
4214             }
4215             right_pos = min<TIndexOffU>(max_right_pos, right_pos);
4216             assert_leq(left_pos, right_pos);
4217 
4218             SeedHP seedHP;
4219 
4220             seedHP.range.first = left_pos;
4221             seedHP.range.second = right_pos;
4222             for(size_t i = eb; i < ee; i++) {
4223                 string snp_ids = "rps" + to_string(snps[i]->id);
4224                 seedHP.snpIDs.push_back(snp_ids);
4225             }
4226 
4227             // Add to haplo_list
4228             bool found = false;
4229             for(size_t i = 0; i < haplo_list.size(); i++) {
4230                 if(haplo_list[i] == seedHP) {
4231                     found = true;
4232                     break;
4233                 }
4234             }
4235             if(!found) {
4236                 // Add
4237                 haplo_list.push_back(seedHP);
4238             }
4239 
4240             eb = ee;
4241         }
4242         ee++;
4243     }
4244 }
4245 
4246 
init(TIndexOffU sa_size,TIndexOffU seed_len,TIndexOffU seed_count)4247 void RB_SubSA::init(TIndexOffU sa_size,
4248                     TIndexOffU seed_len,
4249                     TIndexOffU seed_count)
4250 {
4251     assert_gt(sa_size, 1);
4252     sa_size_ = sa_size;
4253     assert_gt(seed_len, 0);
4254     seed_len_ = seed_len;
4255     assert_gt(seed_count, 1);
4256     seed_count_ = seed_count;
4257     temp_suffixes_.clear();
4258 
4259     repeat_list_.clear();
4260     repeat_index_.clear();
4261 }
4262 
4263 template<typename TStr>
push_back(const TStr & s,CoordHelper & coordHelper,TIndexOffU saElt,bool lastInput)4264 void RB_SubSA::push_back(const TStr& s,
4265                          CoordHelper& coordHelper,
4266                          TIndexOffU saElt,
4267                          bool lastInput)
4268 {
4269     if(saElt + seed_len() <= coordHelper.getEnd(saElt)) {
4270         if(seed_count_ == 1) {
4271             repeat_list_.push_back(saElt);
4272         } else {
4273             assert_gt(seed_count_, 1);
4274 
4275             if(temp_suffixes_.empty()) {
4276                 temp_suffixes_.push_back(saElt);
4277                 return;
4278             }
4279 
4280             TIndexOffU prev_saElt = temp_suffixes_.back();
4281 
4282             // calculate common prefix length between two text.
4283             //   text1 is started from prev_saElt and text2 is started from saElt
4284             bool same = isSameSequenceUpto(s,
4285                                            coordHelper,
4286                                            prev_saElt,
4287                                            saElt,
4288                                            seed_len_);
4289 
4290             if(same) {
4291                 temp_suffixes_.push_back(saElt);
4292             }
4293 
4294             if(!same || lastInput) {
4295                 if(temp_suffixes_.size() >= seed_count_) {
4296                     repeat_index_.push_back(repeat_list_.size());
4297                     temp_suffixes_.sort();
4298                     for(size_t pi = 0; pi < temp_suffixes_.size(); pi++) {
4299                         repeat_list_.push_back(temp_suffixes_[pi]);
4300                     }
4301                 }
4302                 temp_suffixes_.clear();
4303                 if(!lastInput) {
4304                     temp_suffixes_.push_back(saElt);
4305                 } else {
4306                     temp_suffixes_.nullify();
4307                 }
4308             }
4309         }
4310     }
4311 
4312     if(lastInput) {
4313         size_t bit = sizeof(uint32_t) * 8;
4314         size_t num = (repeat_list_.size() + bit - 1) / bit;
4315         done_.resizeExact(num);
4316         done_.fillZero();
4317 
4318 #ifndef NDEBUG
4319         string prev_seq = "";
4320         for(size_t i = 0; i < repeat_index_.size(); i++) {
4321             TIndexOffU saBegin = repeat_index_[i];
4322             TIndexOffU saEnd = (i + 1 < repeat_index_.size() ? repeat_index_[i+1] : repeat_list_.size());
4323             string seq = getString(s, repeat_list_[saBegin], seed_len());
4324             for(size_t j = saBegin + 1; j < saEnd; j++) {
4325                 string tmp_seq = getString(s, repeat_list_[j], seed_len());
4326                 assert_eq(seq, tmp_seq);
4327             }
4328             if(prev_seq != "" ) {
4329                 assert_lt(prev_seq, seq);
4330             }
4331             prev_seq = seq;
4332         }
4333 #endif
4334     }
4335 }
4336 
4337 template<typename TStr>
find(const TStr & s,const string & seq) const4338 Range RB_SubSA::find(const TStr& s,
4339                      const string& seq) const
4340 {
4341     TIndexOffU i = find_repeat_idx(s, seq);
4342     if(i >= repeat_index_.size())
4343         return Range(0, 0);
4344 
4345     Range range(repeat_index_[i],
4346                 i + 1 < repeat_index_.size() ? repeat_index_[i+1] : repeat_list_.size());
4347     return range;
4348 }
4349 
4350 template<typename TStr>
find_repeat_idx(const TStr & s,const string & seq) const4351 TIndexOffU RB_SubSA::find_repeat_idx(const TStr& s,
4352                                      const string& seq) const
4353 {
4354     assert_eq(seq.length(), seed_len_);
4355 
4356     string temp;
4357     size_t l = 0, r = repeat_index_.size();
4358     while(l < r) {
4359         size_t m = (r + l) >> 1;
4360         TIndexOffU saElt = repeat_list_[repeat_index_[m]];
4361         getString(s, saElt, seed_len_, temp);
4362         if(seq == temp) {
4363             return m;
4364         } else if(seq < temp) {
4365             r = m;
4366         } else {
4367             assert(seq > temp);
4368             l = m + 1;
4369         }
4370     }
4371     return repeat_index_.size();
4372 }
4373 
setDone(TIndexOffU off,TIndexOffU len)4374 void RB_SubSA::setDone(TIndexOffU off, TIndexOffU len)
4375 {
4376     assert_leq(off + len, repeat_list_.size());
4377     const TIndexOffU bit = sizeof(uint32_t) * 8;
4378     for(TIndexOffU i = off; i < off + len; i++) {
4379         TIndexOffU quotient = i / bit;
4380         TIndexOffU remainder = i % bit;
4381         assert_lt(quotient, done_.size());
4382         uint32_t num = done_[quotient];
4383         num = num | (1 << remainder);
4384         done_[quotient] = num;
4385     }
4386 }
4387 
isDone(TIndexOffU off,TIndexOffU len) const4388 bool RB_SubSA::isDone(TIndexOffU off, TIndexOffU len) const
4389 {
4390     assert_leq(off + len, repeat_list_.size());
4391     const TIndexOffU bit = sizeof(uint32_t) * 8;
4392     for(TIndexOffU i = off; i < off + len; i++) {
4393         TIndexOffU quotient = i / bit;
4394         TIndexOffU remainder = i % bit;
4395         assert_lt(quotient, done_.size());
4396         uint32_t num = done_[quotient];
4397         num = num & (1 << remainder);
4398         if(num == 0)
4399             return false;
4400     }
4401 
4402     return true;
4403 }
4404 
4405 template<typename TStr>
buildRepeatBase(const TStr & s,CoordHelper & coordHelper,const size_t max_len,EList<RB_RepeatBase> & repeatBases)4406 void RB_SubSA::buildRepeatBase(const TStr& s,
4407                                CoordHelper& coordHelper,
4408                                const size_t max_len,
4409                                EList<RB_RepeatBase>& repeatBases)
4410 {
4411     if(repeat_index_.empty())
4412         return;
4413 
4414     done_.fillZero();
4415 
4416     EList<uint8_t> senseDominant;
4417     senseDominant.resizeExact(repeat_index_.size());
4418     senseDominant.fillZero();
4419 
4420     EList<size_t> repeatStack;
4421     EList<pair<TIndexOffU, TIndexOffU> > size_table;
4422     size_table.reserveExact(repeat_index_.size() / 2 + 1);
4423     for(size_t i = 0; i < repeat_index_.size(); i++) {
4424         TIndexOffU begin = repeat_index_[i];
4425         TIndexOffU end = (i + 1 < repeat_index_.size() ? repeat_index_[i+1] : repeat_list_.size());
4426         assert_lt(begin, end);
4427         EList<TIndexOffU> positions; positions.reserveExact(end - begin);
4428         for(size_t j = begin; j < end; j++) positions.push_back(repeat_list_[j]);
4429 
4430         if(!isSenseDominant(coordHelper, positions, seed_len_))
4431             continue;
4432 
4433         senseDominant[i] = 1;
4434         size_table.expand();
4435         size_table.back().first = end - begin;
4436         size_table.back().second = i;
4437     }
4438     size_table.sort();
4439 
4440     size_t bundle_count = 0;
4441     string tmp_str;
4442     EList<Range> tmp_ranges; tmp_ranges.resizeExact(4);
4443     EList<pair<size_t, size_t> > tmp_sort_ranges; tmp_sort_ranges.resizeExact(4);
4444     for(int64_t i = (int64_t)size_table.size() - 1; i >= 0; i--) {
4445         TIndexOffU idx = size_table[i].second;
4446         ASSERT_ONLY(TIndexOffU num = size_table[i].first);
4447         assert_lt(idx, repeat_index_.size());
4448 #ifndef NDEBUG
4449         if(idx + 1 < repeat_index_.size()) {
4450             assert_eq(repeat_index_[idx] + num, repeat_index_[idx+1]);
4451         } else {
4452             assert_eq(repeat_index_[idx] + num, repeat_list_.size());
4453         }
4454 #endif
4455         TIndexOffU saBegin = repeat_index_[idx];
4456         if(isDone(saBegin)) {
4457             assert(isDone(saBegin, num));
4458             continue;
4459         }
4460 
4461         ASSERT_ONLY(size_t rb_done = 0);
4462         assert_lt(saBegin, repeat_list_.size());
4463         repeatStack.push_back(idx);
4464         while(!repeatStack.empty()) {
4465             TIndexOffU idx = repeatStack.back();
4466             assert(senseDominant[idx]);
4467             repeatStack.pop_back();
4468             assert_lt(idx, repeat_index_.size());
4469             TIndexOffU saBegin = repeat_index_[idx];
4470             TIndexOffU saEnd = (idx + 1 < repeat_index_.size() ? repeat_index_[idx + 1] : repeat_list_.size());
4471             if(isDone(saBegin)) {
4472                 assert(isDone(saBegin, saEnd - saBegin));
4473                 continue;
4474             }
4475 
4476             TIndexOffU saElt = repeat_list_[saBegin];
4477             size_t ri = repeatBases.size();
4478             repeatBases.expand();
4479             repeatBases.back().seq = getString(s, saElt, seed_len_);
4480             repeatBases.back().nodes.clear();
4481             repeatBases.back().nodes.push_back(idx);
4482             ASSERT_ONLY(rb_done++);
4483             setDone(saBegin, saEnd - saBegin);
4484             bool left = true;
4485             while(repeatBases[ri].seq.length() <= max_len) {
4486                 if(left) {
4487                     tmp_str = "N";
4488                     tmp_str += repeatBases[ri].seq.substr(0, seed_len_ - 1);
4489                 } else {
4490                     tmp_str = repeatBases[ri].seq.substr(repeatBases[ri].seq.length() - seed_len_ + 1, seed_len_ - 1);
4491                     tmp_str.push_back('N');
4492                 }
4493                 assert_eq(tmp_str.length(), seed_len_);
4494 
4495                 for(size_t c = 0; c < 4; c++) {
4496                     if(left) tmp_str[0] = "ACGT"[c];
4497                     else     tmp_str.back() = "ACGT"[c];
4498                     assert_eq(tmp_str.length(), seed_len_);
4499                     TIndexOffU idx = find_repeat_idx(s, tmp_str);
4500                     size_t num = 0;
4501                     if(idx < repeat_index_.size()) {
4502                         if(idx + 1 < repeat_index_.size()) {
4503                             num = repeat_index_[idx+1] - repeat_index_[idx];
4504                         } else {
4505                             num = repeat_list_.size() - repeat_index_[idx];
4506                         }
4507                     }
4508                     tmp_ranges[c].first = idx;
4509                     tmp_ranges[c].second = num;
4510                     assert(num == 0 || num >= seed_count_);
4511                     tmp_sort_ranges[c].first = num;
4512                     tmp_sort_ranges[c].second = c;
4513                     if(idx == repeat_index_.size() ||
4514                        isDone(repeat_index_[idx]) ||
4515                        !senseDominant[idx]) {
4516 #ifndef NDEBUG
4517                         if(idx < repeat_index_.size()) {
4518                             assert(isDone(repeat_index_[idx], num) ||
4519                                    !senseDominant[idx]);
4520                         }
4521 #endif
4522                         tmp_sort_ranges[c].first = 0;
4523                         tmp_ranges[c].second = 0;
4524                     }
4525                 }
4526                 tmp_sort_ranges.sort();
4527                 if(tmp_sort_ranges[3].first < seed_count_) {
4528                     if(left) {
4529                         left = false;
4530                         continue;
4531                     } else {
4532                         break;
4533                     }
4534                 }
4535 
4536                 for(size_t cc = 0; cc < 3; cc++) {
4537                     assert_leq(tmp_sort_ranges[cc].first, tmp_sort_ranges[cc+1].first);
4538                     if(tmp_sort_ranges[cc].first < seed_count_)
4539                         continue;
4540 
4541                     size_t c = tmp_sort_ranges[cc].second;
4542                     repeatStack.push_back(tmp_ranges[c].first);
4543                 }
4544 
4545                 size_t c = tmp_sort_ranges[3].second;
4546                 if(repeatBases[ri].seq.length() >= max_len) {
4547                     assert_eq(repeatBases[ri].seq.length(), max_len);
4548                     TIndexOffU idx = tmp_ranges[c].first;
4549                     assert(!isDone(repeat_index_[idx]));
4550                     repeatStack.push_back(idx);
4551                     if(left) {
4552                         left = false;
4553                         continue;
4554                     } else {
4555                         break;
4556                     }
4557                 } else {
4558                     TIndexOffU idx = tmp_ranges[c].first;
4559                     TIndexOffU num = tmp_ranges[c].second;
4560                     setDone(repeat_index_[idx], num);
4561                     if(left) {
4562                         repeatBases[ri].seq.insert(0, 1, "ACGT"[c]);
4563                         repeatBases[ri].nodes.insert(idx, 0);
4564                     } else {
4565                         repeatBases[ri].seq.push_back("ACGT"[c]);
4566                         repeatBases[ri].nodes.push_back(idx);
4567                     }
4568                 }
4569             }
4570         }
4571 
4572         assert_gt(rb_done, 0);
4573         bundle_count++;
4574     }
4575 
4576     cerr << "Bundle count: " << bundle_count << endl;
4577 
4578 #ifndef NDEBUG
4579     {
4580         set<TIndexOffU> idx_set;
4581         for(size_t i = 0; i < repeatBases.size(); i++) {
4582             const RB_RepeatBase& repeatBase = repeatBases[i];
4583             for(size_t j = 0; j < repeatBase.nodes.size(); j++) {
4584                 assert(idx_set.find(repeatBase.nodes[j]) == idx_set.end());
4585                 idx_set.insert(repeatBase.nodes[j]);
4586             }
4587         }
4588     }
4589 #endif
4590 
4591 #if 0
4592     {
4593         EList<pair<size_t, size_t> > kmer_table;
4594         string query;
4595         size_t total = 0, match = 0;
4596         size_t interval = 1;
4597         if(repeat_index_.size() >= 10000) {
4598             interval = repeat_index_.size() / 10000;
4599         }
4600         EList<TIndexOffU> positions;
4601         for(size_t i = 0; i < repeat_index_.size(); i += interval) {
4602             TIndexOffU saElt_idx = repeat_index_[i];
4603             TIndexOffU saElt_idx_end = (i + 1 < repeat_index_.size() ? repeat_index_[i+1] : repeat_list_.size());
4604             positions.clear();
4605             for(size_t j = saElt_idx; j < saElt_idx_end; j++) {
4606                 positions.push_back(repeat_list_[j]);
4607 #ifndef NDEBUG
4608                 if(j > saElt_idx) {
4609                     TIndexOffU lcp_len = getLCP(s,
4610                                                 coordHelper,
4611                                                 positions[0],
4612                                                 positions.back(),
4613                                                 seed_len_);
4614                     assert_geq(lcp_len, seed_len_);
4615                 }
4616 
4617                 TIndexOffU saElt = repeat_list_[j];
4618                 TIndexOffU start = coordHelper.getStart(saElt);
4619                 TIndexOffU start2 = coordHelper.getStart(saElt + seed_len_ - 1);
4620                 assert_eq(start, start2);
4621 #endif
4622             }
4623 
4624             TIndexOffU saElt = repeat_list_[saElt_idx];
4625             size_t true_count = saElt_idx_end - saElt_idx;
4626             getString(s, saElt, seed_len_, query);
4627 
4628             total++;
4629 
4630             size_t count = 0;
4631             for(size_t r = 0; r < repeatBases.size(); r++) {
4632                 const RB_RepeatBase& repeat = repeatBases[r];
4633                 int pos = repeat.seq.find(query);
4634                 if(pos != string::npos) {
4635                     for(size_t j = 0; j < repeat.nodes.size(); j++) {
4636                         TIndexOffU _node = repeat.nodes[j];
4637                         TIndexOffU _saElt_idx = repeat_index_[_node];
4638                         TIndexOffU _saElt_idx_end = (_node + 1 < repeat_index_.size() ? repeat_index_[_node+1] : repeat_list_.size());
4639                         TIndexOffU _saElt = repeat_list_[_saElt_idx];
4640                         string seq = getString(s, _saElt, seed_len());
4641                         if(query == seq) {
4642                             count += (_saElt_idx_end - _saElt_idx);
4643                         }
4644                     }
4645                 }
4646             }
4647 
4648             size_t rc_count = 0;
4649             string rc_query = reverse_complement(query);
4650             for(size_t r = 0; r < repeatBases.size(); r++) {
4651                 const RB_RepeatBase& repeat = repeatBases[r];
4652                 int pos = repeat.seq.find(rc_query);
4653                 if(pos != string::npos) {
4654                     for(size_t j = 0; j < repeat.nodes.size(); j++) {
4655                         TIndexOffU _node = repeat.nodes[j];
4656                         TIndexOffU _saElt_idx = repeat_index_[_node];
4657                         TIndexOffU _saElt_idx_end = (_node + 1 < repeat_index_.size() ? repeat_index_[_node+1] : repeat_list_.size());
4658                         TIndexOffU _saElt = repeat_list_[_saElt_idx];
4659                         string seq = getString(s, _saElt, seed_len());
4660                         if(rc_query == seq) {
4661                             rc_count += (_saElt_idx_end - _saElt_idx);
4662                         }
4663                     }
4664                 }
4665             }
4666 
4667             if(count == true_count || rc_count == true_count) {
4668                 match++;
4669             } else if(total - match <= 10) {
4670                 cerr << "query: " << query << endl;
4671                 cerr << "true count: " << true_count << endl;
4672                 cerr << "found count: " << count << endl;
4673                 cerr << "rc found count: " << rc_count << endl;
4674                 cerr << endl;
4675             }
4676         }
4677 
4678         cerr << "RB_SubSA: sanity check: " << match << " passed (out of " << total << ")" << endl << endl;
4679     }
4680 #endif
4681 }
4682 
4683 
4684 #define write_fp(x) fp.write((const char *)&(x), sizeof((x)))
4685 
writeFile(ofstream & fp)4686 void RB_SubSA::writeFile(ofstream& fp)
4687 {
4688     write_fp(sa_size_);
4689     write_fp(seed_len_);
4690     write_fp(seed_count_);
4691 
4692     size_t sz = temp_suffixes_.size();
4693     write_fp(sz);
4694     for(size_t i = 0; i < sz; i++) {
4695         write_fp(temp_suffixes_[i]);
4696     }
4697 
4698     sz = repeat_index_.size();
4699     write_fp(sz);
4700     for(size_t i = 0; i < sz; i++) {
4701         write_fp(repeat_index_[i]);
4702     }
4703 
4704     sz = done_.size();
4705     write_fp(sz);
4706     for(size_t i = 0; i < sz; i++) {
4707         write_fp(done_[i]);
4708     }
4709 }
4710 
4711 #define read_fp(x) fp.read((char *)&(x), sizeof((x)))
4712 
readFile(ifstream & fp)4713 void RB_SubSA::readFile(ifstream& fp)
4714 {
4715     TIndexOffU val;
4716 
4717     read_fp(val);
4718     rt_assert_eq(val, sa_size_);
4719 
4720     read_fp(val);
4721     rt_assert_eq(val, seed_len_);
4722 
4723     read_fp(val);
4724     rt_assert_eq(val, seed_count_);
4725 
4726     size_t sz;
4727     read_fp(sz);
4728     temp_suffixes_.resizeExact(sz);
4729     for(size_t i = 0; i < sz; i++) {
4730         read_fp(temp_suffixes_[i]);
4731     }
4732 
4733     size_t val_sz;
4734 
4735     // repeat_index_
4736     read_fp(val_sz);
4737     repeat_index_.resizeExact(val_sz);
4738     for(size_t i = 0; i < val_sz; i++) {
4739         read_fp(repeat_index_[i]);
4740     }
4741 
4742     // done
4743     read_fp(val_sz);
4744     done_.resizeExact(val_sz);
4745     for(size_t i = 0; i < val_sz; i++) {
4746         read_fp(done_[i]);
4747     }
4748 
4749 }
4750 
4751 
4752 /****************************/
4753 template class RepeatBuilder<SString<char> >;
4754 template void dump_tstr(const SString<char>& );
4755 template bool compareRepeatCoordByJoinedOff(const RepeatCoord<TIndexOffU>& , const RepeatCoord<TIndexOffU>&);
4756