1
2 /*
3 * Copyright 2018, Chanhee Park <parkchanhee@gmail.com> and Daehwan Kim <infphilo@gmail.com>
4 *
5 * This file is part of HISAT 2.
6 *
7 * HISAT 2 is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HISAT 2 is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HISAT 2. If not, see <http://www.gnu.org/licenses/>.
19 */
20
21 #include <iostream>
22 #include <vector>
23 #include <algorithm>
24 #include <cmath>
25 #include "timer.h"
26 #include "aligner_sw.h"
27 #include "aligner_result.h"
28 #include "scoring.h"
29 #include "sstring.h"
30
31 #include "repeat_builder.h"
32 #include "repeat_kmer.h"
33 #include "bit_packed_array.h"
34
levenshtein_distance(const std::string & s1,const std::string & s2)35 unsigned int levenshtein_distance(const std::string& s1, const std::string& s2)
36 {
37 const std::size_t len1 = s1.size(), len2 = s2.size();
38 std::vector<unsigned int> col(len2+1), prevCol(len2+1);
39
40 for (unsigned int i = 0; i < prevCol.size(); i++)
41 prevCol[i] = i;
42 for (unsigned int i = 0; i < len1; i++) {
43 col[0] = i+1;
44 for (unsigned int j = 0; j < len2; j++)
45 // note that std::min({arg1, arg2, arg3}) works only in C++11,
46 // for C++98 use std::min(std::min(arg1, arg2), arg3)
47 col[j+1] = std::min( std::min(prevCol[1 + j] + 1, col[j] + 1), prevCol[j] + (s1[i]==s2[j] ? 0 : 1) );
48 col.swap(prevCol);
49 }
50 return prevCol[len2];
51 }
52
53
hamming_distance(const string & s1,const string & s2)54 unsigned int hamming_distance(const string& s1, const string& s2)
55 {
56 assert_eq(s1.length(), s2.length());
57
58 uint32_t cnt = 0;
59
60 for(size_t i = 0; i < s1.length(); i++) {
61 if(s1[i] != s2[i]) {
62 cnt++;
63 }
64 }
65
66 return cnt;
67 }
68
reverse(const string & str)69 string reverse(const string& str)
70 {
71 string rev = str;
72 size_t str_len = str.length();
73
74 for(size_t i = 0; i < str_len; i++) {
75 rev[i] = str[str_len - i - 1];
76 }
77
78 return rev;
79 }
80
reverse_complement(const string & str)81 string reverse_complement(const string& str)
82 {
83 string rev = str;
84 size_t str_len = str.length();
85
86 for(size_t i = 0; i < str_len; i++) {
87 char nt = str[str_len - i - 1];
88 rev[i] = asc2dnacomp[nt];
89 }
90
91 return rev;
92 }
93
94 template<typename TStr>
getLCP(const TStr & s,CoordHelper & coordHelper,TIndexOffU a,TIndexOffU b,TIndexOffU max_len=1000)95 TIndexOffU getLCP(const TStr& s,
96 CoordHelper& coordHelper,
97 TIndexOffU a,
98 TIndexOffU b,
99 TIndexOffU max_len = 1000)
100 {
101 TIndexOffU a_end = coordHelper.getEnd(a);
102 TIndexOffU b_end = coordHelper.getEnd(b);
103
104 assert_leq(a_end, s.length());
105 assert_leq(b_end, s.length());
106
107 TIndexOffU k = 0;
108 while((a + k) < a_end && (b + k) < b_end && k < max_len) {
109 if(s[a + k] != s[b + k]) {
110 break;
111 }
112 k++;
113 }
114
115 return k;
116 }
117
118 template<typename TStr>
isSameSequenceUpto(const TStr & s,CoordHelper & coordHelper,TIndexOffU a,TIndexOffU b,TIndexOffU upto)119 bool isSameSequenceUpto(const TStr& s,
120 CoordHelper& coordHelper,
121 TIndexOffU a,
122 TIndexOffU b,
123 TIndexOffU upto)
124 {
125 TIndexOffU a_end = coordHelper.getEnd(a);
126 TIndexOffU b_end = coordHelper.getEnd(b);
127 assert_leq(a_end, s.length());
128 assert_leq(b_end, s.length());
129
130 if(a + upto > a_end || b + upto > b_end)
131 return false;
132
133 for(int k = upto - 1; k >= 0; k--) {
134 if(s[a + k] != s[b + k]) {
135 return false;
136 }
137 }
138
139 return true;
140 }
141
isSenseDominant(CoordHelper & coordHelper,const EList<TIndexOffU> & positions,size_t seed_len)142 bool isSenseDominant(CoordHelper& coordHelper,
143 const EList<TIndexOffU>& positions,
144 size_t seed_len)
145 {
146 // count the number of seeds on the sense strand
147 size_t sense_mer_count = 0;
148 for(size_t i = 0; i < positions.size(); i++) {
149 if(positions[i] < coordHelper.forward_length())
150 sense_mer_count++;
151 }
152
153 // there exists two sets of seeds that are essentially the same
154 // due to sense and antisense strands except palindromic sequences
155 // choose one and skip the other one
156 assert_leq(sense_mer_count, positions.size());
157 size_t antisense_mer_count = positions.size() - sense_mer_count;
158 if(sense_mer_count < antisense_mer_count) {
159 return false;
160 } else if(sense_mer_count == antisense_mer_count) {
161 assert_geq(positions.back(), coordHelper.forward_length());
162 TIndexOffU sense_pos = coordHelper.length() - positions.back() - seed_len;
163 if(positions[0] > sense_pos) return false;
164 }
165
166 return true;
167 }
168
169 static const Range EMPTY_RANGE = Range(1, 0);
170
171 struct RepeatRange {
RepeatRangeRepeatRange172 RepeatRange() {
173 forward = true;
174 };
RepeatRangeRepeatRange175 RepeatRange(Range r, int id) :
176 range(r), rg_id(id), forward(true) {};
RepeatRangeRepeatRange177 RepeatRange(Range r, int id, bool fw) :
178 range(r), rg_id(id), forward(fw) {};
179
180 Range range;
181 int rg_id;
182 bool forward;
183 };
184
reverseRange(const Range & r,TIndexOffU size)185 Range reverseRange(const Range& r, TIndexOffU size)
186 {
187 size_t len = r.second - r.first;
188 Range rc;
189
190 rc.first = size - r.second;
191 rc.second = rc.first + len;
192
193 return rc;
194 }
195
reverseComplement(const string & str)196 string reverseComplement(const string& str)
197 {
198 string rc;
199 for(TIndexOffU si = str.length(); si > 0; si--) {
200 rc.push_back(asc2dnacomp[str[si - 1]]);
201 }
202 return rc;
203 }
204
205
toMDZ(const EList<Edit> & edits,const string & read)206 string toMDZ(const EList<Edit>& edits, const string& read)
207 {
208 StackedAln stk;
209 BTDnaString btread;
210 BTString buf;
211
212 btread.install(read.c_str(), true);
213
214 stk.init(btread, edits, 0, 0, 0, 0);
215 stk.buildMdz();
216 stk.writeMdz(&buf, NULL);
217
218 return string(buf.toZBuf());
219 }
220
applyEdits(const string & ref,size_t read_len,const EList<Edit> & edits,const Coord & coord)221 string applyEdits(const string& ref, size_t read_len, const EList<Edit>& edits, const Coord& coord)
222 {
223 string read;
224 size_t ref_pos = coord.off();
225 size_t read_pos = 0;
226
227 for(size_t i = 0; i < edits.size(); i++) {
228 for(; read_pos < edits[i].pos; read_pos++, ref_pos++) {
229 read.push_back(ref[ref_pos]);
230 }
231
232 if(edits[i].type == EDIT_TYPE_READ_GAP) {
233 // delete on read
234 ref_pos++;
235 } else if(edits[i].type == EDIT_TYPE_REF_GAP) {
236 // insert on read
237 read.push_back(edits[i].qchr);
238 read_pos++;
239 } else if(edits[i].type == EDIT_TYPE_MM) {
240 assert_eq(ref[ref_pos], edits[i].chr);
241 read.push_back(edits[i].qchr);
242
243 read_pos++;
244 ref_pos++;
245 } else {
246 assert(false);
247 }
248 }
249
250 for(; read_pos < read_len; read_pos++, ref_pos++) {
251 read.push_back(ref[ref_pos]);
252 }
253
254 return read;
255 }
256
257 template<typename index_t>
compareRepeatCoordByJoinedOff(const RepeatCoord<index_t> & a,const RepeatCoord<index_t> & b)258 static bool compareRepeatCoordByJoinedOff(const RepeatCoord<index_t>& a, const RepeatCoord<index_t>& b)
259 {
260 return a.joinedOff < b.joinedOff;
261 }
262
263
264 template<typename TStr>
getString(const TStr & ref,TIndexOffU start,size_t len)265 string getString(const TStr& ref, TIndexOffU start, size_t len)
266 {
267 ASSERT_ONLY(const size_t ref_len = ref.length());
268 assert_leq(start + len, ref_len);
269
270 string s;
271 for(size_t i = 0; i < len; i++) {
272 char nt = "ACGT"[ref[start + i]];
273 s.push_back(nt);
274 }
275
276 return s;
277 }
278
279 template<typename TStr>
getString(const TStr & ref,TIndexOffU start,size_t len,string & s)280 void getString(const TStr& ref, TIndexOffU start, size_t len, string& s)
281 {
282 s.clear();
283 ASSERT_ONLY(const size_t ref_len = ref.length());
284 assert_leq(start + len, ref_len);
285 for(size_t i = 0; i < len; i++) {
286 char nt = "ACGT"[ref[start + i]];
287 s.push_back(nt);
288 }
289 }
290
291
292 template<typename TStr>
getSequenceBase(const TStr & ref,TIndexOffU pos)293 inline uint8_t getSequenceBase(const TStr& ref, TIndexOffU pos)
294 {
295 assert_lt(pos, ref.length());
296 return (uint8_t)ref[pos];
297 }
298
299
300 template<typename TStr>
dump_tstr(const TStr & s)301 void dump_tstr(const TStr& s)
302 {
303 static int print_width = 60;
304
305 size_t s_len = s.length();
306
307 for(size_t i = 0; i < s_len; i += print_width) {
308 string buf;
309 for(size_t j = 0; (j < print_width) && (i + j < s_len); j++) {
310 buf.push_back("ACGTN"[s[i + j]]);
311 }
312 cerr << buf << endl;
313 }
314 cerr << endl;
315 }
316
317
getMaxMatchLen(const EList<Edit> & edits,const size_t read_len)318 size_t getMaxMatchLen(const EList<Edit>& edits, const size_t read_len)
319 {
320 size_t max_length = 0;
321 uint32_t last_edit_pos = 0;
322 uint32_t len = 0;
323
324 if (edits.size() == 0) {
325 // no edits --> exact match
326 return read_len;
327 }
328
329 for(size_t i = 0; i < edits.size(); i++) {
330 if (last_edit_pos > edits[i].pos) {
331 continue;
332 }
333
334 len = edits[i].pos - last_edit_pos;
335 if(len > max_length) {
336 max_length = len;
337 }
338
339 last_edit_pos = edits[i].pos + 1;
340 }
341
342 if (last_edit_pos < read_len) {
343 len = read_len - last_edit_pos;
344 if(len > max_length) {
345 max_length = len;
346 }
347 }
348
349 return max_length;
350 }
351
max_index(size_t array[4])352 int max_index(size_t array[4])
353 {
354 int max_idx = 0;
355
356 for(size_t i = 1; i < 4; i++) {
357 if(array[max_idx] < array[i]) {
358 max_idx = i;
359 }
360 }
361
362 return max_idx;
363 }
364
CoordHelper(TIndexOffU length,TIndexOffU forward_length,const EList<RefRecord> & szs,const EList<string> & ref_names)365 CoordHelper::CoordHelper(TIndexOffU length,
366 TIndexOffU forward_length,
367 const EList<RefRecord>& szs,
368 const EList<string>& ref_names) :
369 length_(length),
370 forward_length_(forward_length),
371 szs_(szs),
372 ref_namelines_(ref_names)
373 {
374 // build ref_names_ from ref_namelines_
375 buildNames();
376 buildJoinedFragment();
377 }
378
~CoordHelper()379 CoordHelper::~CoordHelper()
380 {
381 }
382
buildNames()383 void CoordHelper::buildNames()
384 {
385 ref_names_.resize(ref_namelines_.size());
386 for(size_t i = 0; i < ref_namelines_.size(); i++) {
387 string& nameline = ref_namelines_[i];
388
389 for(size_t j = 0; j < nameline.length(); j++) {
390 char n = nameline[j];
391 if(n == ' ') {
392 break;
393 }
394 ref_names_[i].push_back(n);
395 }
396 }
397 }
398
mapJoinedOffToSeq(TIndexOffU joinedOff)399 int CoordHelper::mapJoinedOffToSeq(TIndexOffU joinedOff)
400 {
401 /* search from cached_list */
402 if(num_cached_ > 0) {
403 for(size_t i = 0; i < num_cached_; i++) {
404 Fragments *frag = &cached_[i];
405 if(frag->contain(joinedOff)) {
406 return frag->frag_id;
407 }
408 }
409 /* fall through */
410 }
411
412 /* search list */
413 int top = 0;
414 int bot = fraglist_.size() - 1;
415 int pos = 0;
416
417 Fragments *frag = &fraglist_[pos];
418 while((bot - top) > 1) {
419 pos = top + ((bot - top) >> 1);
420 frag = &fraglist_[pos];
421
422 if (joinedOff < frag->joinedOff) {
423 bot = pos;
424 } else {
425 top = pos;
426 }
427 }
428
429 frag = &fraglist_[top];
430 if (frag->contain(joinedOff)) {
431 // update cache
432 if (num_cached_ < CACHE_SIZE_JOINEDFRG) {
433 cached_[num_cached_] = *frag;
434 num_cached_++;
435 } else {
436 cached_[victim_] = *frag;
437 victim_++; // round-robin
438 victim_ %= CACHE_SIZE_JOINEDFRG;
439 }
440
441 return top;
442 }
443
444 return -1;
445 }
446
getGenomeCoord(TIndexOffU joinedOff,string & chr_name,TIndexOffU & pos_in_chr)447 int CoordHelper::getGenomeCoord(TIndexOffU joinedOff,
448 string& chr_name,
449 TIndexOffU& pos_in_chr)
450 {
451 int seq_id = mapJoinedOffToSeq(joinedOff);
452 if (seq_id < 0) {
453 return -1;
454 }
455
456 Fragments *frag = &fraglist_[seq_id];
457 TIndexOffU offset = joinedOff - frag->joinedOff;
458
459 pos_in_chr = frag->seqOff + offset;
460 chr_name = ref_names_[frag->seq_id];
461
462 return 0;
463 }
464
buildJoinedFragment()465 void CoordHelper::buildJoinedFragment()
466 {
467 TIndexOffU acc_joinedOff = 0;
468 TIndexOffU acc_seqOff = 0;
469 TIndexOffU seq_id = 0;
470 TIndexOffU frag_id = 0;
471 for(size_t i = 0; i < szs_.size(); i++) {
472 const RefRecord& rec = szs_[i];
473 if(rec.len == 0) {
474 continue;
475 }
476 if (rec.first) {
477 acc_seqOff = 0;
478 seq_id++;
479 }
480
481 fraglist_.expand();
482
483 fraglist_.back().joinedOff = acc_joinedOff;
484 fraglist_.back().length = rec.len;
485
486 fraglist_.back().frag_id = frag_id++;
487 fraglist_.back().seq_id = seq_id - 1;
488 fraglist_.back().first = rec.first;
489
490 acc_seqOff += rec.off;
491 fraglist_.back().seqOff = acc_seqOff;
492
493 acc_joinedOff += rec.len;
494 acc_seqOff += rec.len;
495 }
496
497 // Add Last Fragment(empty)
498 fraglist_.expand();
499 fraglist_.back().joinedOff = acc_joinedOff;
500 fraglist_.back().length = 0;
501 fraglist_.back().seqOff = acc_seqOff;
502 fraglist_.back().first = false;
503 fraglist_.back().frag_id = frag_id;
504 fraglist_.back().seq_id = seq_id;
505 }
506
getEnd(TIndexOffU e)507 TIndexOffU CoordHelper::getEnd(TIndexOffU e) {
508 assert_lt(e, length_)
509
510 TIndexOffU end = 0;
511 if(e < forward_length_) {
512 int frag_id = mapJoinedOffToSeq(e);
513 assert_geq(frag_id, 0);
514 end = fraglist_[frag_id].joinedOff + fraglist_[frag_id].length;
515 } else {
516 // ReverseComplement
517 // a, b are positions w.r.t reverse complement string.
518 // fragment map is based on forward string
519 int frag_id = mapJoinedOffToSeq(length_ - e - 1);
520 assert_geq(frag_id, 0);
521 end = length_ - fraglist_[frag_id].joinedOff;
522 }
523
524 assert_leq(end, length_);
525 return end;
526 }
527
getStart(TIndexOffU e)528 TIndexOffU CoordHelper::getStart(TIndexOffU e) {
529 assert_lt(e, length_)
530
531 TIndexOffU start = 0;
532 if(e < forward_length_) {
533 int frag_id = mapJoinedOffToSeq(e);
534 assert_geq(frag_id, 0);
535 start = fraglist_[frag_id].joinedOff;
536 } else {
537 // ReverseComplement
538 // a, b are positions w.r.t reverse complement string.
539 // fragment map is based on forward string
540 int frag_id = mapJoinedOffToSeq(length_ - e - 1);
541 assert_geq(frag_id, 0);
542 start = length_ - (fraglist_[frag_id].joinedOff + fraglist_[frag_id].length);
543 }
544
545 assert_leq(start, length_);
546 return start;
547 }
548
549 template<typename TStr>
getExtendedSeedSequence(const TStr & s,string & seq) const550 void SeedExt::getExtendedSeedSequence(const TStr& s,
551 string& seq) const
552 {
553 seq.clear();
554 TIndexOffU prev_end = orig_pos.first;
555 for(size_t j = 0; j < left_gaps.size(); j++) {
556 TIndexOffU curr_end = orig_pos.first - left_gaps[j].first;
557 assert_leq(curr_end, prev_end);
558 if(curr_end < prev_end) {
559 seq = getString(s, curr_end, prev_end - curr_end) + seq;
560 }
561 int gap_len = left_gaps[j].second;
562 assert_neq(gap_len, 0);
563 if(gap_len > 0) { // deletion
564 string gap_str(gap_len, '-');
565 seq = gap_str + seq;
566 } else {
567 curr_end += gap_len;
568 }
569 prev_end = curr_end;
570 }
571 assert_leq(pos.first, prev_end);
572 if(pos.first < prev_end) {
573 seq = getString(s, pos.first, prev_end - pos.first) + seq;
574 }
575 if(orig_pos.second - orig_pos.first > 0)
576 seq += getString(s, orig_pos.first, orig_pos.second - orig_pos.first);
577 TIndexOffU prev_begin = orig_pos.second;
578 for(size_t j = 0; j < right_gaps.size(); j++) {
579 TIndexOffU curr_begin = orig_pos.second + right_gaps[j].first;
580 assert_leq(prev_begin, curr_begin);
581 if(prev_begin < curr_begin) {
582 seq += getString(s, prev_begin, curr_begin - prev_begin);
583 }
584 int gap_len = right_gaps[j].second;
585 assert_neq(gap_len, 0);
586 if(gap_len > 0) { // deletion
587 string gap_str(gap_len, '-');
588 seq += gap_str;
589 } else {
590 curr_begin -= gap_len;
591 }
592 prev_begin = curr_begin;
593 }
594 assert_leq(prev_begin, pos.second);
595 if(prev_begin < pos.second) {
596 seq += getString(s, prev_begin, pos.second - prev_begin);
597 }
598 }
599
lookup_add_SNP(EList<SeedSNP * > & repeat_snps,SeedSNP & snp)600 SeedSNP *lookup_add_SNP(EList<SeedSNP *>& repeat_snps, SeedSNP& snp)
601 {
602 for(size_t i = 0; i < repeat_snps.size(); i++) {
603 if(*repeat_snps[i] == snp) {
604 return repeat_snps[i];
605 }
606 }
607
608 repeat_snps.expand();
609 repeat_snps.back() = new SeedSNP();
610 *(repeat_snps.back()) = snp;
611 return repeat_snps.back();
612 }
613
614 template<typename TStr>
generateSNPs(const TStr & s,const string & consensus,EList<SeedSNP * > & repeat_snps)615 void SeedExt::generateSNPs(const TStr &s, const string& consensus, EList<SeedSNP *>& repeat_snps)
616 {
617 EList<pair<TIndexOffU, int> > gaps;
618
619 // Merge Gaps
620 {
621 TIndexOffU left_ext_len = getLeftExtLength();
622 for(size_t g = 0; g < left_gaps.size(); g++) {
623 gaps.expand();
624 gaps.back().first = left_ext_len - left_gaps[g].first + left_gaps[g].second;
625 gaps.back().second = left_gaps[g].second;
626 }
627 TIndexOffU right_base = orig_pos.second - pos.first;
628 for(size_t g = 0; g < right_gaps.size(); g++) {
629 gaps.expand();
630 gaps.back().first = right_base + right_gaps[g].first;
631 gaps.back().second = right_gaps[g].second;
632 }
633 gaps.sort();
634 }
635
636 //
637 string con_ref;
638 string seq_read;
639 TIndexOffU prev_con_pos = consensus_pos.first;
640 TIndexOffU prev_seq_pos = pos.first;
641
642 for(size_t g = 0; g < gaps.size(); g++) {
643 TIndexOffU curr_seq_pos = pos.first + gaps[g].first;
644 TIndexOffU curr_con_pos = prev_con_pos + (curr_seq_pos - prev_seq_pos);
645
646 con_ref = consensus.substr(prev_con_pos, curr_con_pos - prev_con_pos);
647 seq_read = getString(s, prev_seq_pos, curr_seq_pos - prev_seq_pos);
648 for(size_t l = 0; l < con_ref.length(); l++) {
649 if(seq_read[l] != con_ref[l]) {
650 // Single
651 SeedSNP snp;
652 snp.init(EDIT_TYPE_MM, prev_con_pos + l, 1, seq_read[l]);
653
654 // lookup repeat_snp
655 snps.expand();
656 snps.back() = lookup_add_SNP(repeat_snps, snp);
657 }
658 }
659
660 int gap_len = gaps[g].second;
661 assert_neq(gap_len, 0);
662 if(gap_len > 0) {
663 // Deletion (gap on read)
664
665 string gap_str(gap_len, '-');
666
667 SeedSNP snp;
668 snp.init(EDIT_TYPE_READ_GAP, curr_con_pos, gap_len, consensus.substr(curr_con_pos, gap_len));
669
670 snps.expand();
671 snps.back() = lookup_add_SNP(repeat_snps, snp);
672
673 curr_con_pos += gap_len;
674
675 } else {
676 // Insertion (gap on reference)
677 string gap_str(abs(gap_len), '-');
678
679 SeedSNP snp;
680 snp.init(EDIT_TYPE_REF_GAP, curr_con_pos, abs(gap_len), getString(s, curr_seq_pos, abs(gap_len)));
681
682 snps.expand();
683 snps.back() = lookup_add_SNP(repeat_snps, snp);
684
685 curr_seq_pos += abs(gap_len);
686 }
687
688 prev_con_pos = curr_con_pos;
689 prev_seq_pos = curr_seq_pos;
690 }
691
692 assert_eq(consensus_pos.second - prev_con_pos, pos.second - prev_seq_pos);
693 con_ref = consensus.substr(prev_con_pos, consensus_pos.second - prev_con_pos);
694 seq_read = getString(s, prev_seq_pos, pos.second - prev_seq_pos);
695
696 for(size_t l = 0; l < con_ref.length(); l++) {
697 if(seq_read[l] != con_ref[l]) {
698 // Single
699
700 SeedSNP snp;
701 snp.init(EDIT_TYPE_MM, prev_con_pos + l, 1, seq_read[l]);
702
703 snps.expand();
704 snps.back() = lookup_add_SNP(repeat_snps, snp);
705 }
706 }
707 }
708
seedCmp(const SeedExt & s,const SeedExt & s2)709 bool seedCmp(const SeedExt& s, const SeedExt& s2)
710 {
711 if(s.getLength() != s2.getLength())
712 return s.getLength() > s2.getLength();
713 if(s.pos.first != s2.pos.first)
714 return s.pos.first < s2.pos.first;
715 if(s.pos.second != s2.pos.second)
716 return s.pos.second < s2.pos.second;
717 return false;
718 }
719
720 template<typename TStr>
init(const RepeatParameter & rp,const TStr & s,CoordHelper & coordHelper,const RB_SubSA & subSA,const RB_RepeatBase & repeatBase)721 void RB_Repeat::init(const RepeatParameter& rp,
722 const TStr& s,
723 CoordHelper& coordHelper,
724 const RB_SubSA& subSA,
725 const RB_RepeatBase& repeatBase)
726 {
727 consensus_ = repeatBase.seq;
728 assert_geq(consensus_.length(), rp.min_repeat_len);
729 assert_eq(consensus_.length(), rp.min_repeat_len + repeatBase.nodes.size() - 1);
730
731 EList<TIndexOffU> positions, next_positions;
732
733 const EList<TIndexOffU>& repeat_index = subSA.getRepeatIndex();
734 seeds_.clear();
735 for(size_t n = 0; n < repeatBase.nodes.size(); n++) {
736 TIndexOffU idx = repeatBase.nodes[n];
737 TIndexOffU saBegin = repeat_index[idx];
738 TIndexOffU saEnd = (idx + 1 < repeat_index.size() ? repeat_index[idx+1] : subSA.size());
739
740 next_positions.clear();
741 for(size_t sa = saBegin; sa < saEnd; sa++) {
742 TIndexOffU left = subSA.get(sa);
743 TIndexOffU right = left + subSA.seed_len();
744 next_positions.push_back(left);
745
746 if(left > 0) {
747 size_t idx = positions.bsearchLoBound(left - 1);
748 if(idx < positions.size() && positions[idx] == left - 1)
749 continue;
750 }
751
752 seeds_.expand();
753 SeedExt& seed = seeds_.back();
754 seed.reset();
755 seed.orig_pos = pair<TIndexOffU, TIndexOffU>(left, right);
756 seed.pos = seed.orig_pos;
757 seed.consensus_pos.first = n;
758 seed.consensus_pos.second = n + rp.min_repeat_len;
759 seed.bound = pair<TIndexOffU, TIndexOffU>(coordHelper.getStart(left), coordHelper.getEnd(left));
760
761 #ifndef NDEBUG
762 string tmp_str = getString(s, seed.pos.first, seed.pos.second - seed.pos.first);
763 assert_eq(tmp_str, consensus_.substr(n, seed.pos.second - seed.pos.first));
764 #endif
765
766 for(size_t p = seed.consensus_pos.second; p < consensus_.length(); p++) {
767 size_t pos = seed.pos.second;
768 if(pos >= seed.bound.second)
769 break;
770 int nt = getSequenceBase(s, pos);
771 if("ACGT"[nt] != consensus_[p])
772 break;
773 seed.pos.second++;
774 seed.consensus_pos.second++;
775 }
776 }
777 positions = next_positions;
778 }
779
780 internal_update();
781 }
782
783 template<typename TStr>
extendConsensus(const RepeatParameter & rp,const TStr & s)784 void RB_Repeat::extendConsensus(const RepeatParameter& rp,
785 const TStr& s)
786 {
787 size_t remain = seeds_.size();
788 EList<string> left_consensuses, right_consensuses, empty_consensuses;
789 EList<size_t> ed_seed_nums;
790
791 const TIndexOffU default_max_ext_len = (rp.max_repeat_len - rp.seed_len) / 2;
792 const TIndexOffU seed_mm = 1;
793 string left_ext_consensus = "", right_ext_consensus = "";
794
795 empty_consensuses.resize(seed_mm + 1);
796 empty_consensuses.fill("");
797
798 while(remain >= rp.repeat_count) {
799 for(size_t i = 0; i < remain; i++) {
800 seeds_[i].done = false;
801 seeds_[i].curr_ext_len = 0;
802 }
803
804 // extend seeds in left or right direction
805 TIndexOffU max_ext_len = min(default_max_ext_len, (TIndexOffU)(rp.max_repeat_len - consensus_.length()));
806 get_consensus_seq(s,
807 seeds_,
808 0, // seed begin
809 remain, // seed end
810 max_ext_len,
811 max_ext_len,
812 seed_mm,
813 rp,
814 ed_seed_nums,
815 &left_consensuses,
816 &right_consensuses);
817
818 size_t allowed_seed_mm = 0;
819 left_ext_consensus.clear();
820 right_ext_consensus.clear();
821 for(int i = 0 /* (int)seed_mm */; i >= 0; i--) {
822 size_t extlen = (ed_seed_nums[i] < rp.repeat_count ? 0 : (left_consensuses[i].length() + right_consensuses[i].length()));
823 // if(extlen <= 0 || extlen < max_ext_len * i / seed_mm)
824 //if(extlen < max_ext_len * 2)
825 // continue;
826 if(extlen <= 0)
827 continue;
828
829 left_ext_consensus = left_consensuses[i];
830 right_ext_consensus = right_consensuses[i];
831 allowed_seed_mm = (size_t)i;
832 assert_gt(left_ext_consensus.length(), 0);
833 assert_gt(right_ext_consensus.length(), 0);
834 break;
835 }
836 size_t num_passed_seeds = 0;
837 if(left_ext_consensus.length() > 0 && right_ext_consensus.length()) {
838 consensus_ = reverse(left_ext_consensus) + consensus_;
839 consensus_ += right_ext_consensus;
840
841 #if 0
842 calc_edit_dist(s,
843 seeds_,
844 0,
845 remain,
846 left ? consensuses : empty_consensuses,
847 left ? empty_consensuses : consensuses,
848 allowed_seed_mm);
849 #endif
850
851 // update seeds
852 for(size_t i = 0; i < seeds_.size(); i++) {
853 SeedExt& seed = seeds_[i];
854
855 if(i < remain) {
856 if(seed.ed <= allowed_seed_mm) {
857 num_passed_seeds++;
858 seed.done = true;
859 seed.total_ed += seed.ed;
860 seed.pos.first -= left_ext_consensus.length();
861 seed.pos.second += right_ext_consensus.length();
862 seed.consensus_pos.first = 0;
863 seed.consensus_pos.second = consensus_.length();
864
865 if(seed.left_gaps.size() > 0 &&
866 seed.left_gaps.back().first >= seed.orig_pos.first - seed.pos.first) {
867 int gap_len = seed.left_gaps.back().second;
868 seed.pos.first += gap_len;
869 }
870 if(seed.right_gaps.size() > 0 &&
871 seed.right_gaps.back().first >= seed.pos.second - seed.orig_pos.second) {
872 int gap_len = seed.right_gaps.back().second;
873 seed.pos.second -= gap_len;
874 }
875 }
876 }
877 }
878
879 // move up "done" seeds
880 size_t j = 0;
881 for(size_t i = 0; i < remain; i++) {
882 if(!seeds_[i].done) continue;
883 assert_geq(i, j);
884 if(i > j) {
885 SeedExt temp = seeds_[j];
886 seeds_[j] = seeds_[i];
887 seeds_[i] = temp;
888 // Find next "undone" seed
889 j++;
890 while(j < i && seeds_[j].done) {
891 j++;
892 }
893 assert(j < remain && !seeds_[j].done);
894 } else {
895 j = i + 1;
896 }
897 }
898 }
899
900 remain = num_passed_seeds;
901 break;
902 } // while(remain >= rp.repeat_count)
903
904 #ifndef NDEBUG
905 // make sure seed positions are unique
906 EList<pair<TIndexOffU, TIndexOffU> > seed_poses;
907 for(size_t i = 0; i < seeds_.size(); i++) {
908 seed_poses.expand();
909 seed_poses.back() = seeds_[i].orig_pos;
910 }
911 seed_poses.sort();
912 for(size_t i = 0; i + 1 < seed_poses.size(); i++) {
913 if(seed_poses[i].first == seed_poses[i+1].first) {
914 assert_lt(seed_poses[i].second, seed_poses[i+1].second);
915 } else {
916 assert_lt(seed_poses[i].first, seed_poses[i+1].first);
917 }
918 }
919
920 for(size_t i = 0; i < seeds_.size(); i++) {
921 assert(seeds_[i].valid());
922 }
923 #endif
924
925 RB_Repeat::internal_update();
926
927 // check repeats within a repeat sequence
928 self_repeat_ = false;
929 for(size_t i = 0; i + 1 < seed_ranges_.size(); i++) {
930 const RB_AlleleCoord& range = seed_ranges_[i];
931 for(size_t j = i + 1; j < seed_ranges_.size(); j++) {
932 RB_AlleleCoord& range2 = seed_ranges_[j];
933 if(range.right <= range2.left)
934 break;
935 self_repeat_ = true;
936 }
937 }
938 }
939
940 template<typename TStr>
getNextRepeat(const RepeatParameter & rp,const TStr & s,RB_Repeat & o)941 void RB_Repeat::getNextRepeat(const RepeatParameter& rp,
942 const TStr& s,
943 RB_Repeat& o)
944 {
945 o.reset();
946
947 size_t i = 0;
948 for(i = 0; i < seeds_.size() && seeds_[i].done; i++);
949 if(i >= seeds_.size())
950 return;
951
952 for(size_t j = i; j < seeds_.size(); j++) {
953 assert(!seeds_[j].done);
954 o.seeds_.expand();
955 o.seeds_.back() = seeds_[j];
956
957 }
958 seeds_.resizeExact(i);
959 internal_update();
960
961 assert_gt(o.seeds_.size(), 0);
962 o.consensus_ = getString(s, o.seeds_[0].orig_pos.first, o.seeds_[0].orig_pos.second - o.seeds_[0].orig_pos.first);
963 for(size_t j = 0; j < o.seeds_.size(); j++) {
964 o.seeds_[j].pos = o.seeds_[j].orig_pos;
965 o.seeds_[j].left_gaps.clear();
966 o.seeds_[j].right_gaps.clear();
967 o.seeds_[j].ed = o.seeds_[j].total_ed = 0;
968 }
969 }
970
internal_update()971 void RB_Repeat::internal_update()
972 {
973 sort(seeds_.begin(), seeds_.end(), seedCmp);
974
975 seed_ranges_.resizeExact(seeds_.size());
976 for(size_t i = 0; i < seeds_.size(); i++) {
977 seed_ranges_[i].left = seeds_[i].pos.first;
978 seed_ranges_[i].right = seeds_[i].pos.second;
979 seed_ranges_[i].idx = i;
980 }
981 seed_ranges_.sort();
982
983 // check repeats within a repeat sequence
984 size_t remove_count = 0;
985 for(size_t i = 0; i + 1 < seed_ranges_.size(); i++) {
986 const RB_AlleleCoord& range = seed_ranges_[i];
987 if(range.left == numeric_limits<size_t>::max())
988 continue;
989 for(size_t j = i + 1; j < seed_ranges_.size(); j++) {
990 RB_AlleleCoord& range2 = seed_ranges_[j];
991 if(range2.left == numeric_limits<size_t>::max())
992 continue;
993 if(range.right <= range2.left)
994 break;
995 if(range.right >= range2.right) {
996 range2.left = numeric_limits<size_t>::max();
997 remove_count++;
998 }
999 }
1000 }
1001
1002 if(remove_count <= 0)
1003 return;
1004
1005 for(size_t i = 0; i < seed_ranges_.size(); i++) {
1006 if(seed_ranges_[i].left == numeric_limits<size_t>::max())
1007 seeds_[seed_ranges_[i].idx].reset();
1008 }
1009 sort(seeds_.begin(), seeds_.end(), seedCmp);
1010
1011 #ifndef NDEBUG
1012 for(size_t i = 0; i < seeds_.size(); i++) {
1013 if(i < seeds_.size() - remove_count) {
1014 assert_lt(seeds_[i].pos.first, seeds_[i].pos.second);
1015 } else {
1016 assert_eq(seeds_[i].pos.first, seeds_[i].pos.second);
1017 }
1018 }
1019 #endif
1020
1021 seeds_.resize(seeds_.size() - remove_count);
1022 seed_ranges_.resize(seeds_.size());
1023 for(size_t i = 0; i < seeds_.size(); i++) {
1024 seed_ranges_[i].left = seeds_[i].pos.first;
1025 seed_ranges_[i].right = seeds_[i].pos.second;
1026 seed_ranges_[i].idx = i;
1027 }
1028 seed_ranges_.sort();
1029 }
1030
contain(TIndexOffU left,TIndexOffU right) const1031 bool RB_Repeat::contain(TIndexOffU left, TIndexOffU right) const
1032 {
1033 size_t l = 0, r = seed_ranges_.size();
1034 while(l < r) {
1035 size_t m = (l + r) / 2;
1036 const RB_AlleleCoord& coord = seed_ranges_[m];
1037 if(right <= coord.left) {
1038 r = m;
1039 } else if(left >= coord.right) {
1040 l = m + 1;
1041 } else {
1042 return coord.left <= left && right <= coord.right;
1043 }
1044 }
1045
1046 return false;
1047 }
1048
1049 template<typename TStr>
saveSeedExtension(const RepeatParameter & rp,const TStr & s,CoordHelper & coordHelper,TIndexOffU grp_id,ostream & fp,size_t & total_repeat_seq_len,size_t & total_allele_seq_len) const1050 void RB_Repeat::saveSeedExtension(const RepeatParameter& rp,
1051 const TStr& s,
1052 CoordHelper& coordHelper,
1053 TIndexOffU grp_id,
1054 ostream& fp,
1055 size_t& total_repeat_seq_len,
1056 size_t& total_allele_seq_len) const
1057 {
1058 // apply color, which is compatible with linux commands such as cat and less -r
1059 #if 1
1060 const string red = "\033[31m", reset = "\033[0m", resetline = "\033[4m", redline = "\033[4;31m";
1061 #else
1062 const string red = "", reset = "", resetline = "", redline = "";
1063 #endif
1064 bool show_exterior_seq = true;
1065 const size_t max_show_seq_len = 700;
1066
1067 size_t total_count = 0;
1068 for(size_t i = 0; i < seeds_.size(); i++) {
1069 const SeedExt& seed = seeds_[i];
1070 size_t ext_len = seed.getLength();
1071 if(ext_len < rp.min_repeat_len) continue;
1072 total_allele_seq_len += ext_len;
1073 total_count++;
1074 bool sense_strand = seed.pos.first < coordHelper.forward_length();
1075
1076 fp << setw(6) << repeat_id_ << " " << setw(5) << seeds_.size();
1077 fp << " " << setw(4) << i;
1078 fp << " " << setw(4) << ext_len;
1079 fp << " " << (sense_strand ? '+' : '-');
1080 fp << " " << setw(10) << seed.pos.first << " " << setw(10) << seed.pos.second;
1081 fp << " " << setw(10) << seed.orig_pos.first << " " << setw(10) << seed.orig_pos.second;
1082
1083 string chr_name;
1084 TIndexOffU pos_in_chr;
1085 if(sense_strand) {
1086 coordHelper.getGenomeCoord(seed.pos.first, chr_name, pos_in_chr);
1087 } else {
1088 coordHelper.getGenomeCoord(s.length() - seed.pos.first - (seed.pos.second - seed.pos.first), chr_name, pos_in_chr);
1089 }
1090 fp << " " << setw(5) << chr_name << ":" << setw(10) << std::left << pos_in_chr << std::right;
1091
1092 if(sense_strand) {
1093 coordHelper.getGenomeCoord(seed.pos.second, chr_name, pos_in_chr);
1094 } else {
1095 coordHelper.getGenomeCoord(s.length() - seed.pos.second - (seed.pos.second - seed.pos.first), chr_name, pos_in_chr);
1096 }
1097 fp << " " << setw(5) << chr_name << ":" << setw(10) << std::left << pos_in_chr << std::right;
1098
1099 if(!seed.aligned) {
1100 fp << endl;
1101 continue;
1102 }
1103
1104 string deststr = "";
1105 seed.getExtendedSeedSequence(s, deststr);
1106
1107 // add exterior sequences
1108 if(seed.consensus_pos.first > 0) {
1109 TIndexOffU seq_pos, seq_len;
1110 if(seed.pos.first >= seed.consensus_pos.first) {
1111 seq_pos = seed.pos.first - seed.consensus_pos.first;
1112 seq_len = seed.consensus_pos.first;
1113 } else {
1114 seq_pos = 0;
1115 seq_len = seed.pos.first;
1116 }
1117 deststr = getString(s, seq_pos, seq_len) + deststr;
1118 if(seq_len < seed.consensus_pos.first) {
1119 deststr = string(seed.consensus_pos.first - seq_len, 'N') + deststr;
1120 }
1121 }
1122 if(seed.consensus_pos.second < consensus_.length()) {
1123 deststr += getString(s, seed.pos.second, consensus_.length() - seed.consensus_pos.second);
1124 }
1125
1126 assert_eq(consensus_.length(), deststr.length());
1127 fp << " ";
1128
1129 // print sequence w.r.t. the current group
1130 for(size_t j = 0; j < min(consensus_.length(), max_show_seq_len); j++) {
1131 bool outside = j < seed.consensus_pos.first || j >= seed.consensus_pos.second;
1132 bool different = (consensus_[j] != deststr[j]);
1133 if(outside) {
1134 if(show_exterior_seq) {
1135 if(different) {
1136 fp << redline;
1137 fp << deststr[j];
1138 fp << reset;
1139 } else {
1140 fp << resetline;
1141 fp << deststr[j];
1142 fp << reset;
1143 }
1144 } else {
1145 if(j < seed.consensus_pos.first)
1146 fp << " ";
1147 }
1148 } else {
1149 if(different) fp << red;
1150 fp << deststr[j];
1151 if(different) fp << reset;
1152 }
1153 }
1154
1155 #if 0
1156 fp << "\t";
1157 for(size_t ei = 0; ei < seed.edits.size(); ei++) {
1158 const Edit& edit = seed.edits[ei];
1159 if(ei > 0) fp << ",";
1160 fp << edit;
1161 if (edit.snpID != std::numeric_limits<uint32_t>::max()) {
1162 fp << "@" << edit.snpID;
1163 }
1164 }
1165 #endif
1166
1167 fp << endl;
1168 }
1169
1170 if(total_count > 0) fp << setw(5) << total_count << endl << endl;
1171 total_repeat_seq_len += consensus_.length();
1172 }
1173
1174 template<typename TStr>
calc_edit_dist(const TStr & s,const SeedExt & seed,const SeedExt & seed2,size_t left_ext,size_t right_ext,size_t max_ed)1175 size_t calc_edit_dist(const TStr& s,
1176 const SeedExt& seed,
1177 const SeedExt& seed2,
1178 size_t left_ext,
1179 size_t right_ext,
1180 size_t max_ed)
1181 {
1182 if(seed.bound.first + left_ext > seed.pos.first ||
1183 seed.pos.second + right_ext > seed.bound.second ||
1184 seed2.bound.first + left_ext > seed2.pos.first ||
1185 seed2.pos.second + right_ext > seed2.bound.second)
1186 return max_ed + 1;
1187
1188 size_t ed = 0;
1189 for(size_t i = 0; i < left_ext; i++) {
1190 int ch = getSequenceBase(s, seed.pos.first - i - 1);
1191 assert_range(0, 3, ch);
1192 int ch2 = getSequenceBase(s, seed2.pos.first - i - 1);
1193 assert_range(0, 3, ch2);
1194 if(ch != ch2) ed++;
1195 if(ed > max_ed)
1196 return ed;
1197
1198 }
1199 for(size_t i = 0; i < right_ext; i++) {
1200 int ch = getSequenceBase(s, seed.pos.second + i);
1201 assert_range(0, 3, ch);
1202 int ch2 = getSequenceBase(s, seed2.pos.second + i);
1203 assert_range(0, 3, ch2);
1204 if(ch != ch2) ed++;
1205 if(ed > max_ed)
1206 return ed;
1207 }
1208
1209 return ed;
1210 }
1211
extract_kmer(const string & seq,size_t offset,size_t k=5)1212 size_t extract_kmer(const string& seq, size_t offset, size_t k = 5)
1213 {
1214 assert_leq(offset + k, seq.length());
1215 size_t kmer = 0;
1216 for(size_t i = 0; i < k; i++) {
1217 kmer = (kmer << 2) | asc2dna[seq[offset + i]];
1218 }
1219 return kmer;
1220 }
1221
next_kmer(size_t kmer,char base,size_t k=5)1222 size_t next_kmer(size_t kmer, char base, size_t k = 5)
1223 {
1224 kmer &= ((1 << ((k-1)*2))) - 1;
1225 kmer = (kmer << 2) | asc2dna[base];
1226 return kmer;
1227 }
1228
build_kmer_table(const string & consensus,EList<pair<size_t,size_t>> & kmer_table,size_t k=5)1229 void build_kmer_table(const string& consensus, EList<pair<size_t, size_t> >& kmer_table, size_t k = 5)
1230 {
1231 kmer_table.clear();
1232 if(consensus.length() < k)
1233 return;
1234 size_t kmer = 0;
1235 for(size_t i = 0; i + k <= consensus.length(); i++) {
1236 if(i == 0) {
1237 kmer = extract_kmer(consensus, i, k);
1238 } else {
1239 kmer = next_kmer(kmer, consensus[i+k-1], k);
1240 }
1241 kmer_table.expand();
1242 kmer_table.back().first = kmer;
1243 kmer_table.back().second = i;
1244 }
1245 kmer_table.sort();
1246 }
1247
find_gap_pos(const string & s,const string & s2,EList<size_t> & ed,EList<size_t> & ed2,bool del,size_t gap_len,size_t max_mm,size_t & gap_pos,size_t & mm,bool debug=false)1248 void find_gap_pos(const string& s,
1249 const string& s2,
1250 EList<size_t>& ed,
1251 EList<size_t>& ed2,
1252 bool del,
1253 size_t gap_len,
1254 size_t max_mm,
1255 size_t& gap_pos,
1256 size_t& mm,
1257 bool debug = false)
1258 {
1259 assert_eq(s.length(), s2.length());
1260 size_t seq_len = s.length();
1261 ed.resizeExact(seq_len); ed.fill(max_mm + 1);
1262 ed2.resizeExact(seq_len); ed2.fill(max_mm + 1);
1263
1264 // from left to right
1265 for(size_t i = 0; i < seq_len; i++) {
1266 size_t add = (s[i] == s2[i] ? 0 : 1);
1267 if(i == 0) ed[i] = add;
1268 else ed[i] = ed[i-1] + add;
1269 if(ed[i] >= max_mm + 1) break;
1270 }
1271
1272 // from right to left
1273 size_t s_sub = del ? 0 : gap_len;
1274 size_t s2_sub = del ? gap_len : 0;
1275 for(int i = seq_len - 1; i >= gap_len; i--) {
1276 size_t add = (s[i - s_sub] == s2[i - s2_sub] ? 0 : 1);
1277 if(i == seq_len - 1) ed2[i] = add;
1278 else ed2[i] = ed2[i+1] + add;
1279 if(ed2[i] > max_mm) break;
1280 }
1281
1282 if(debug) {
1283 cout << s << endl << s2 << endl;
1284 for(size_t i = 0; i < ed.size(); i++) {
1285 cout << (ed[i] % 10);
1286 }
1287 cout << endl;
1288 for(size_t i = 0; i < ed2.size(); i++) {
1289 cout << (ed2[i] % 10);
1290 }
1291 cout << endl;
1292 }
1293
1294 size_t min_mm = ed2[gap_len];
1295 int min_mm_i = -1;
1296 assert_eq(ed.size(), ed2.size());
1297 for(size_t i = 0; i + gap_len + 1 < ed.size(); i++) {
1298 if(ed[i] > max_mm) break;
1299 size_t cur_mm = ed[i] + ed2[i + gap_len + 1];
1300 if(cur_mm < min_mm) {
1301 min_mm = cur_mm;
1302 min_mm_i = i;
1303 }
1304 }
1305
1306 mm = min_mm;
1307 if(mm > max_mm)
1308 return;
1309 gap_pos = (size_t)(min_mm_i + 1);
1310 }
1311
align_with_one_gap(const string & s,const EList<pair<size_t,size_t>> & s_kmer_table,const string & s2,EList<size_t> & counts,size_t max_gap,size_t max_mm,size_t & mm,size_t & gap_pos,int & gap_len,size_t k=5)1312 void align_with_one_gap(const string& s,
1313 const EList<pair<size_t, size_t> >& s_kmer_table,
1314 const string& s2,
1315 EList<size_t>& counts,
1316 size_t max_gap,
1317 size_t max_mm,
1318 size_t& mm,
1319 size_t& gap_pos,
1320 int& gap_len,
1321 size_t k = 5)
1322 {
1323 mm = max_mm + 1;
1324 assert_eq(s.length(), s2.length());
1325 counts.resizeExact(max_gap * 2 + 1);
1326 counts.fillZero();
1327 size_t max_count = 0, max_count_i = 0;
1328 for(size_t i = 0; i + k <= s2.length(); i += 1) {
1329 pair<size_t, size_t> kmer(0, 0);
1330 kmer.first = extract_kmer(s2, i, k);
1331 size_t lb = s_kmer_table.bsearchLoBound(kmer);
1332 while(lb < s_kmer_table.size() && kmer.first == s_kmer_table[lb].first) {
1333 int gap = (int)s_kmer_table[lb].second - (int)i;
1334 if(gap != 0 && abs(gap) < max_gap) {
1335 size_t gap_i = (size_t)(gap + max_gap);
1336 counts[gap_i]++;
1337 if(counts[gap_i] > max_count) {
1338 max_count = counts[gap_i];
1339 max_count_i = gap_i;
1340 }
1341 }
1342 lb++;
1343 }
1344 }
1345
1346 if(max_count <= 0)
1347 return;
1348
1349 assert_lt(max_count_i, counts.size());
1350 int gap = (int)max_count_i - (int)max_gap;
1351 assert_neq(gap, 0);
1352 size_t abs_gap = abs(gap);
1353 bool del = gap > 0;
1354
1355 EList<size_t> ed, ed2;
1356 find_gap_pos(s,
1357 s2,
1358 ed,
1359 ed2,
1360 del,
1361 abs_gap,
1362 max_mm,
1363 gap_pos,
1364 mm);
1365 if(mm > max_mm)
1366 return;
1367
1368 gap_len = del ? abs_gap : -abs_gap;
1369
1370 #ifndef NDEBUG
1371 assert_leq(mm, max_mm);
1372 string ds = s, ds2 = s2;
1373 size_t debug_mm = 0;
1374 if(del) ds.erase(gap_pos, abs_gap);
1375 else ds2.erase(gap_pos, abs_gap);
1376
1377 for(size_t i = 0; i < min(ds.length(), ds2.length()); i++) {
1378 if(ds[i] != ds2[i])
1379 debug_mm++;
1380 }
1381 assert_eq(debug_mm, mm);
1382 #endif
1383 }
1384
1385 template<typename TStr>
calc_edit_dist(const TStr & s,EList<SeedExt> & seeds,size_t sb,size_t se,const EList<string> & left_consensuses,const EList<string> & right_consensuses,uint32_t max_ed)1386 void calc_edit_dist(const TStr& s,
1387 EList<SeedExt>& seeds,
1388 size_t sb,
1389 size_t se,
1390 const EList<string>& left_consensuses,
1391 const EList<string>& right_consensuses,
1392 uint32_t max_ed)
1393 {
1394 string left_consensus = left_consensuses[max_ed];
1395 string right_consensus = right_consensuses[max_ed];
1396
1397 EList<pair<size_t, size_t> > left_kmer_table, right_kmer_table;
1398 build_kmer_table(left_consensus, left_kmer_table);
1399 build_kmer_table(right_consensus, right_kmer_table);
1400
1401 string left_seq, right_seq;
1402 const size_t max_gap = 10;
1403 EList<size_t> counts;
1404
1405 size_t left_ext = left_consensus.length();
1406 size_t right_ext = right_consensus.length();
1407 for(size_t i = sb; i < se; i++) {
1408 SeedExt& seed = seeds[i];
1409 if(seed.bound.first + left_ext > seed.pos.first ||
1410 seed.pos.second + right_ext > seed.bound.second) {
1411 seed.ed = max_ed + 1;
1412 continue;
1413 }
1414
1415 size_t left_ed = 0;
1416 if(left_ext > 0) {
1417 getString(s, seed.pos.first - left_ext, left_ext, left_seq);
1418 reverse(left_seq.begin(), left_seq.end());
1419 for(size_t j = 0; j < left_ext; j++) {
1420 if(left_seq[j] != left_consensus[j]) left_ed++;
1421 if(left_ed <= max_ed && j < left_consensuses[left_ed].length()) {
1422 seed.curr_ext_len = j + 1;
1423 }
1424 }
1425
1426 if(left_ed > max_ed) {
1427 size_t gap_pos = 0;
1428 int gap_len = 0;
1429 align_with_one_gap(left_consensus,
1430 left_kmer_table,
1431 left_seq,
1432 counts,
1433 min(left_ed - max_ed, max_gap),
1434 max_ed,
1435 left_ed,
1436 gap_pos,
1437 gap_len);
1438 if(left_ed <= max_ed) {
1439 seed.left_gaps.expand();
1440 seed.left_gaps.back().first = seed.getLeftExtLength() + gap_pos;
1441 seed.left_gaps.back().second = gap_len;
1442 }
1443 }
1444 } else {
1445 left_seq.clear();
1446 }
1447
1448 size_t right_ed = 0;
1449 if(right_ext > 0) {
1450 getString(s, seed.pos.second, right_ext, right_seq);
1451 for(size_t j = 0; j < right_ext; j++) {
1452 if(right_seq[j] != right_consensus[j]) right_ed++;
1453 if(right_ed <= max_ed && j < right_consensuses[right_ed].length()) {
1454 seed.curr_ext_len = j + 1;
1455 }
1456 }
1457 if(right_ed > max_ed) {
1458 size_t gap_pos = 0;
1459 int gap_len = 0;
1460 align_with_one_gap(right_consensus,
1461 right_kmer_table,
1462 right_seq,
1463 counts,
1464 min(right_ed - max_ed, max_gap),
1465 max_ed,
1466 right_ed,
1467 gap_pos,
1468 gap_len);
1469 if(right_ed <= max_ed) {
1470 seed.right_gaps.expand();
1471 seed.right_gaps.back().first = seed.getRightExtLength() + gap_pos;
1472 seed.right_gaps.back().second = gap_len;
1473 }
1474 }
1475 } else {
1476 right_seq.clear();
1477 }
1478
1479 seed.ed = left_ed + right_ed;
1480 }
1481 }
1482
1483 template<typename TStr>
get_consensus_seq(const TStr & s,EList<SeedExt> & seeds,size_t sb,size_t se,size_t min_left_ext,size_t min_right_ext,size_t max_ed,const RepeatParameter & rp,EList<size_t> & ed_seed_nums,EList<string> * left_consensuses,EList<string> * right_consensuses) const1484 void RB_Repeat::get_consensus_seq(const TStr& s,
1485 EList<SeedExt>& seeds,
1486 size_t sb,
1487 size_t se,
1488 size_t min_left_ext,
1489 size_t min_right_ext,
1490 size_t max_ed,
1491 const RepeatParameter& rp,
1492 EList<size_t>& ed_seed_nums,
1493 EList<string>* left_consensuses,
1494 EList<string>* right_consensuses) const
1495 {
1496 assert_lt(sb, se);
1497 assert_geq(se - sb, rp.seed_count);
1498 assert_leq(se, seeds.size());
1499 if(left_consensuses != NULL) {
1500 left_consensuses->clear(); left_consensuses->resize(max_ed + 1);
1501 for(size_t i = 0; i < max_ed + 1; i++) {
1502 (*left_consensuses)[i].clear();
1503 }
1504 }
1505 if(right_consensuses != NULL) {
1506 right_consensuses->clear(); right_consensuses->resize(max_ed + 1);
1507 for(size_t i = 0; i < max_ed + 1; i++) {
1508 (*right_consensuses)[i].clear();
1509 }
1510 }
1511
1512 assert_eq(min_left_ext, min_right_ext);
1513
1514 EList<string> seqs;
1515 seqs.reserveExact(seeds.size());
1516 size_t max_i = 0, max_count = 0;
1517 while(min_left_ext > 0) {
1518 seqs.clear();
1519 max_i = 0;
1520 max_count = 0;
1521 for(size_t i = sb; i < se; i++) {
1522 const SeedExt& seed = seeds[i];
1523 if(seeds[i].bound.first + min_left_ext > seed.pos.first ||
1524 seed.pos.second + min_right_ext > seed.bound.second)
1525 continue;
1526
1527 seqs.expand();
1528 if(min_left_ext > 0) {
1529 getString(s, seed.pos.first - min_left_ext, min_left_ext, seqs.back());
1530 }
1531 if(min_right_ext > 0) {
1532 seqs.back() += getString(s, seed.pos.second, min_right_ext);
1533 }
1534 }
1535 seqs.sort();
1536 for(size_t i = 0; i + max_count < seqs.size();) {
1537 size_t count = 1;
1538 for(size_t j = i + 1; j < seqs.size(); j++) {
1539 if(seqs[i] == seqs[j]) count++;
1540 else break;
1541 }
1542 if(count >= max_count) {
1543 max_count = count;
1544 max_i = i;
1545 }
1546 i = i + count;
1547 }
1548 if(max_count >= rp.seed_count)
1549 break;
1550
1551 min_left_ext--;
1552 min_right_ext--;
1553 }
1554
1555 // update ed
1556 // extend consensus string
1557 ed_seed_nums.resize(max_ed + 1); ed_seed_nums.fillZero();
1558 EList<size_t> next_ed_seed_nums; next_ed_seed_nums.resize(max_ed + 1);
1559
1560 if(max_count < rp.seed_count)
1561 return;
1562
1563 for(size_t i = sb; i < se; i++) seeds[i].ed = 0;
1564 size_t seed_ext_len = 0;
1565 while(seed_ext_len < max(min_left_ext, min_right_ext)) {
1566 // select base to be used for extension
1567 uint8_t left_ext_base = 0;
1568 if(seed_ext_len < min_left_ext) left_ext_base = seqs[max_i][min_left_ext - seed_ext_len - 1];
1569 uint8_t right_ext_base = 0;
1570 if(seed_ext_len < min_right_ext) right_ext_base = seqs[max_i][min_left_ext + seed_ext_len];
1571
1572 // estimate extended ed
1573 next_ed_seed_nums.fillZero();
1574 for(size_t i = sb; i < se; i++) {
1575 uint32_t est_ed = seeds[i].ed;
1576 if(seed_ext_len < min_left_ext) {
1577 if(seeds[i].bound.first + seed_ext_len + 1 <= seeds[i].pos.first) {
1578 TIndexOffU left_pos = seeds[i].pos.first - seed_ext_len - 1;
1579 int ch = getSequenceBase(s, left_pos);
1580 assert_range(0, 3, ch);
1581 if ("ACGT"[ch] != left_ext_base) {
1582 est_ed++;
1583 }
1584 } else {
1585 est_ed = max_ed + 1;
1586 }
1587 }
1588
1589 if(seed_ext_len < min_right_ext) {
1590 TIndexOffU right_pos = seeds[i].pos.second + seed_ext_len;
1591 if(right_pos >= seeds[i].bound.second) {
1592 est_ed = max_ed + 1;
1593 } else {
1594 int ch = getSequenceBase(s, right_pos);
1595 assert_range(0, 3, ch);
1596 if ("ACGT"[ch] != right_ext_base) {
1597 est_ed++;
1598 }
1599 }
1600 }
1601
1602 if(est_ed <= max_ed) {
1603 next_ed_seed_nums[est_ed]++;
1604 }
1605 }
1606
1607 for(size_t i = 1; i < next_ed_seed_nums.size(); i++) next_ed_seed_nums[i] += next_ed_seed_nums[i-1];
1608 if(next_ed_seed_nums[max_ed] < rp.repeat_count) {
1609 break;
1610 }
1611
1612 for(size_t i = sb; i < se; i++) {
1613 if(seed_ext_len < min_left_ext) {
1614 if(seeds[i].bound.first + seed_ext_len + 1 <= seeds[i].pos.first) {
1615 TIndexOffU left_pos = seeds[i].pos.first - seed_ext_len - 1;
1616 int ch = getSequenceBase(s, left_pos);
1617 assert_range(0, 3, ch);
1618 if ("ACGT"[ch] != left_ext_base) {
1619 seeds[i].ed++;
1620 }
1621 } else {
1622 seeds[i].ed = max_ed + 1;
1623 }
1624 }
1625
1626 if(seed_ext_len < min_right_ext) {
1627 TIndexOffU right_pos = seeds[i].pos.second + seed_ext_len;
1628 if(right_pos >= seeds[i].bound.second) {
1629 seeds[i].ed = max_ed + 1;
1630 } else {
1631 int ch = getSequenceBase(s, right_pos);
1632 assert_range(0, 3, ch);
1633 if ("ACGT"[ch] != right_ext_base) {
1634 seeds[i].ed++;
1635 }
1636 }
1637 }
1638 }
1639
1640 for(size_t i = 0; i < next_ed_seed_nums.size(); i++) {
1641 if(next_ed_seed_nums[i] < rp.repeat_count)
1642 continue;
1643 ed_seed_nums[i] = next_ed_seed_nums[i];
1644 if(seed_ext_len < min_left_ext) {
1645 assert(left_consensuses != NULL);
1646 (*left_consensuses)[i] += left_ext_base;
1647 }
1648 if(seed_ext_len < min_right_ext) {
1649 assert(right_consensuses != NULL);
1650 (*right_consensuses)[i] += right_ext_base;
1651 }
1652 }
1653
1654 seed_ext_len++;
1655 }
1656 }
1657
1658 template<typename TStr>
get_consensus_seq(const TStr & s,EList<SeedExt> & seeds,size_t sb,size_t se,size_t min_left_ext,size_t min_right_ext,size_t max_ed,const RepeatParameter & rp,EList<size_t> & ed_seed_nums,EList<string> * left_consensuses,EList<string> * right_consensuses) const1659 void RB_RepeatExt::get_consensus_seq(const TStr& s,
1660 EList<SeedExt>& seeds,
1661 size_t sb,
1662 size_t se,
1663 size_t min_left_ext,
1664 size_t min_right_ext,
1665 size_t max_ed,
1666 const RepeatParameter& rp,
1667 EList<size_t>& ed_seed_nums,
1668 EList<string>* left_consensuses,
1669 EList<string>* right_consensuses) const
1670 {
1671 assert_lt(sb, se);
1672 assert_leq(se, seeds.size());
1673 if(left_consensuses != NULL) {
1674 left_consensuses->clear(); left_consensuses->resize(max_ed + 1);
1675 for(size_t i = 0; i < max_ed + 1; i++) {
1676 (*left_consensuses)[i].clear();
1677 }
1678 }
1679 if(right_consensuses != NULL) {
1680 right_consensuses->clear(); right_consensuses->resize(max_ed + 1);
1681 for(size_t i = 0; i < max_ed + 1; i++) {
1682 (*right_consensuses)[i].clear();
1683 }
1684 }
1685
1686 // cluster sequences
1687 EList<size_t> belongto;
1688 belongto.resizeExact(se - sb);
1689 for(size_t i = 0; i < belongto.size(); i++) belongto[i] = i;
1690
1691 for(size_t i = 0; i + 1 < belongto.size(); i++) {
1692 for(size_t j = i + 1; j < belongto.size(); j++) {
1693 if(belongto[j] != j) continue;
1694 size_t ed = calc_edit_dist(s,
1695 seeds[sb + i],
1696 seeds[sb + j],
1697 min_left_ext,
1698 min_right_ext,
1699 max_ed + 1);
1700 if(ed <= max_ed + 1) {
1701 belongto[j] = belongto[i];
1702 }
1703
1704 }
1705 }
1706
1707 // find the maximum group that has the most sequences
1708 EList<size_t> vote;
1709 vote.resizeExact(seeds.size());
1710 vote.fillZero();
1711 size_t max_group = 0;
1712 for(size_t i = 0; i < belongto.size(); i++) {
1713 size_t cur_group = belongto[i];
1714 assert_lt(cur_group, vote.size());
1715 vote[cur_group]++;
1716 if(cur_group != max_group && vote[cur_group] > vote[max_group]) {
1717 max_group = cur_group;
1718 }
1719 }
1720
1721 // reuse vote to store seeds
1722 EList<size_t>& consensus_group = vote;
1723 consensus_group.clear();
1724 for(size_t i = 0; i < belongto.size(); i++) {
1725 if(belongto[i] == max_group)
1726 consensus_group.push_back(i);
1727 }
1728
1729 // update ed
1730 // extend consensus string
1731 ed_seed_nums.resize(max_ed + 1); ed_seed_nums.fillZero();
1732 EList<size_t> next_ed_seed_nums; next_ed_seed_nums.resize(max_ed + 1);
1733 for(size_t i = sb; i < se; i++) seeds[i].ed = 0;
1734 size_t seed_ext_len = 0;
1735 while(seed_ext_len < max(min_left_ext, min_right_ext)) {
1736 // count base
1737 size_t l_count[4] = {0, };
1738 size_t r_count[4] = {0, };
1739
1740 for(size_t i = 0; i < consensus_group.size(); i++) {
1741 size_t si = sb + consensus_group[i];
1742 int ch;
1743 if(seed_ext_len < min_left_ext) {
1744 if(seeds[i].bound.first + seed_ext_len + 1 <= seeds[i].pos.first) {
1745 ch = getSequenceBase(s, seeds[si].pos.first - seed_ext_len - 1);
1746 assert_range(0, 3, ch);
1747 l_count[ch]++;
1748 }
1749 }
1750 if(seed_ext_len < min_right_ext) {
1751 if(seeds[i].bound.second + seed_ext_len) {
1752 ch = getSequenceBase(s, seeds[si].pos.second + seed_ext_len);
1753 assert_range(0, 3, ch);
1754 r_count[ch]++;
1755 }
1756 }
1757 }
1758
1759 // select base to be used for extension
1760 uint8_t left_ext_base = 0;
1761 if(seed_ext_len < min_left_ext) left_ext_base = max_index(l_count);
1762 uint8_t right_ext_base = 0;
1763 if(seed_ext_len < min_right_ext) right_ext_base = max_index(r_count);
1764
1765 // estimate extended ed
1766 next_ed_seed_nums.fillZero();
1767 for(size_t i = sb; i < se; i++) {
1768 uint32_t est_ed = seeds[i].ed;
1769 if(seed_ext_len < min_left_ext) {
1770 if(seeds[i].bound.first + seed_ext_len + 1 <= seeds[i].pos.first) {
1771 TIndexOffU left_pos = seeds[i].pos.first - seed_ext_len - 1;
1772 int ch = getSequenceBase(s, left_pos);
1773 assert_range(0, 3, ch);
1774 if (ch != left_ext_base) {
1775 est_ed++;
1776 }
1777 } else {
1778 est_ed = max_ed + 1;
1779 }
1780 }
1781
1782 if(seed_ext_len < min_right_ext) {
1783 TIndexOffU right_pos = seeds[i].pos.second + seed_ext_len;
1784 if(right_pos >= seeds[i].bound.second) {
1785 est_ed = max_ed + 1;
1786 } else {
1787 int ch = getSequenceBase(s, right_pos);
1788 assert_range(0, 3, ch);
1789 if (ch != right_ext_base) {
1790 est_ed++;
1791 }
1792 }
1793 }
1794
1795 if(est_ed <= max_ed) {
1796 next_ed_seed_nums[est_ed]++;
1797 }
1798 }
1799
1800 for(size_t i = 1; i < next_ed_seed_nums.size(); i++) next_ed_seed_nums[i] += next_ed_seed_nums[i-1];
1801 if(next_ed_seed_nums[max_ed] < rp.repeat_count) {
1802 break;
1803 }
1804
1805 for(size_t i = sb; i < se; i++) {
1806 if(seed_ext_len < min_left_ext) {
1807 if(seeds[i].bound.first + seed_ext_len + 1 <= seeds[i].pos.first) {
1808 TIndexOffU left_pos = seeds[i].pos.first - seed_ext_len - 1;
1809 int ch = getSequenceBase(s, left_pos);
1810 assert_range(0, 3, ch);
1811 if (ch != left_ext_base) {
1812 seeds[i].ed++;
1813 }
1814 } else {
1815 seeds[i].ed = max_ed + 1;
1816 }
1817 }
1818
1819 if(seed_ext_len < min_right_ext) {
1820 TIndexOffU right_pos = seeds[i].pos.second + seed_ext_len;
1821 if(right_pos >= seeds[i].bound.second) {
1822 seeds[i].ed = max_ed + 1;
1823 } else {
1824 int ch = getSequenceBase(s, right_pos);
1825 assert_range(0, 3, ch);
1826 if (ch != right_ext_base) {
1827 seeds[i].ed++;
1828 }
1829 }
1830 }
1831 }
1832
1833 for(size_t i = 0; i < next_ed_seed_nums.size(); i++) {
1834 if(next_ed_seed_nums[i] < rp.repeat_count)
1835 continue;
1836 ed_seed_nums[i] = next_ed_seed_nums[i];
1837 if(seed_ext_len < min_left_ext) {
1838 assert(left_consensuses != NULL);
1839 (*left_consensuses)[i] += (char)("ACGT"[left_ext_base]);
1840 }
1841 if(seed_ext_len < min_right_ext) {
1842 assert(right_consensuses != NULL);
1843 (*right_consensuses)[i] += (char)("ACGT"[right_ext_base]);
1844 }
1845 }
1846
1847 seed_ext_len++;
1848 }
1849 }
1850
1851 EList<size_t> RB_Repeat::ca_ed_;
1852 EList<size_t> RB_Repeat::ca_ed2_;
1853 string RB_Repeat::ca_s_;
1854 string RB_Repeat::ca_s2_;
1855
1856 size_t RB_Repeat::seed_merge_tried = 0;
1857 size_t RB_Repeat::seed_merged = 0;
1858
1859 template<typename TStr>
extendConsensus(const RepeatParameter & rp,const TStr & s)1860 void RB_RepeatExt::extendConsensus(const RepeatParameter& rp,
1861 const TStr& s)
1862 {
1863 size_t remain = seeds_.size();
1864 EList<string> consensuses, empty_consensuses;
1865 EList<size_t> ed_seed_nums;
1866 bool left = true;
1867
1868 const TIndexOffU default_max_ext_len = 100;
1869 const TIndexOffU seed_mm = 4;
1870 string ext_consensus = "";
1871
1872 empty_consensuses.resize(seed_mm + 1);
1873 empty_consensuses.fill("");
1874
1875 while(remain >= rp.repeat_count) {
1876 for(size_t i = 0; i < remain; i++) {
1877 seeds_[i].done = false;
1878 seeds_[i].curr_ext_len = 0;
1879 }
1880
1881 // extend seeds in left or right direction
1882 TIndexOffU max_ext_len = min(default_max_ext_len, (TIndexOffU)(rp.max_repeat_len - consensus_.length()));
1883 get_consensus_seq(s,
1884 seeds_,
1885 0, // seed begin
1886 remain, // seed end
1887 left ? max_ext_len : 0,
1888 left ? 0 : max_ext_len,
1889 seed_mm,
1890 rp,
1891 ed_seed_nums,
1892 left ? &consensuses : NULL,
1893 left ? NULL : &consensuses);
1894
1895 size_t allowed_seed_mm = 0;
1896 ext_consensus.clear();
1897 for(int i = (int)seed_mm; i >= 0; i--) {
1898 size_t extlen = (ed_seed_nums[i] < rp.repeat_count ? 0 : consensuses[i].length());
1899 if(extlen <= 0 || extlen < max_ext_len * i / seed_mm)
1900 continue;
1901
1902 if(i > 0 && consensuses[i].length() <= consensuses[i-1].length() + 5)
1903 continue;
1904
1905 ext_consensus = consensuses[i];
1906 allowed_seed_mm = (size_t)i;
1907 assert(ext_consensus.length() > 0);
1908 break;
1909 }
1910 size_t num_passed_seeds = 0;
1911 if(ext_consensus.length() > 0) {
1912 if(left) consensus_ = reverse(ext_consensus) + consensus_;
1913 else consensus_ += ext_consensus;
1914
1915 calc_edit_dist(s,
1916 seeds_,
1917 0,
1918 remain,
1919 left ? consensuses : empty_consensuses,
1920 left ? empty_consensuses : consensuses,
1921 allowed_seed_mm);
1922
1923 // update seeds
1924 for(size_t i = 0; i < seeds_.size(); i++) {
1925 SeedExt& seed = seeds_[i];
1926
1927 if(i < remain) {
1928 if(seed.ed <= allowed_seed_mm) {
1929 num_passed_seeds++;
1930 seed.done = true;
1931 seed.total_ed += seed.ed;
1932 if(left) {
1933 if(seed.left_gaps.size() > 0 &&
1934 seed.left_gaps.back().first >= seed.orig_pos.first - seed.pos.first) {
1935 int gap_len = seed.left_gaps.back().second;
1936 seed.pos.first += gap_len;
1937 }
1938 seed.pos.first -= ext_consensus.length();
1939 seed.consensus_pos.first = 0;
1940 seed.consensus_pos.second = consensus_.length();
1941 } else {
1942 if(seed.right_gaps.size() > 0 &&
1943 seed.right_gaps.back().first >= seed.pos.second - seed.orig_pos.second) {
1944 int gap_len = seed.right_gaps.back().second;
1945 seed.pos.second -= gap_len;
1946 }
1947 seed.pos.second += ext_consensus.length();
1948 seed.consensus_pos.second = consensus_.length();
1949 }
1950 } else {
1951 if(left) {
1952 assert_leq(seed.curr_ext_len, ext_consensus.length());
1953 TIndexOffU adjust = ext_consensus.length() - seed.curr_ext_len;
1954 seed.consensus_pos.first += adjust;
1955 seed.consensus_pos.second += ext_consensus.length();
1956 assert_leq(seed.curr_ext_len, seed.pos.first);
1957 seed.pos.first -= seed.curr_ext_len;
1958 } else {
1959 assert_leq(seed.curr_ext_len, ext_consensus.length());
1960 seed.consensus_pos.second += seed.curr_ext_len;
1961 seed.pos.second += seed.curr_ext_len;
1962 }
1963 }
1964 } else {
1965 if(left) {
1966 seed.consensus_pos.first += ext_consensus.length();
1967 seed.consensus_pos.second += ext_consensus.length();
1968 }
1969 }
1970 }
1971
1972 // move up "done" seeds
1973 size_t j = 0;
1974 for(size_t i = 0; i < remain; i++) {
1975 if(!seeds_[i].done) continue;
1976 assert_geq(i, j);
1977 if(i > j) {
1978 SeedExt temp = seeds_[j];
1979 seeds_[j] = seeds_[i];
1980 seeds_[i] = temp;
1981 // Find next "undone" seed
1982 j++;
1983 while(j < i && seeds_[j].done) {
1984 j++;
1985 }
1986 assert(j < remain && !seeds_[j].done);
1987 } else {
1988 j = i + 1;
1989 }
1990 }
1991 }
1992
1993 remain = num_passed_seeds;
1994 if(remain < rp.repeat_count) {
1995 if(left) {
1996 left = false;
1997 remain = seeds_.size();
1998 }
1999 }
2000 } // while(remain >= rp.repeat_count)
2001
2002 #ifndef NDEBUG
2003 // make sure seed positions are unique
2004 EList<pair<TIndexOffU, TIndexOffU> > seed_poses;
2005 for(size_t i = 0; i < seeds_.size(); i++) {
2006 seed_poses.expand();
2007 seed_poses.back() = seeds_[i].orig_pos;
2008 }
2009 seed_poses.sort();
2010 for(size_t i = 0; i + 1 < seed_poses.size(); i++) {
2011 if(seed_poses[i].first == seed_poses[i+1].first) {
2012 assert_lt(seed_poses[i].second, seed_poses[i+1].second);
2013 } else {
2014 assert_lt(seed_poses[i].first, seed_poses[i+1].first);
2015 }
2016 }
2017
2018 for(size_t i = 0; i < seeds_.size(); i++) {
2019 assert(seeds_[i].valid());
2020 }
2021 #endif
2022
2023 internal_update();
2024
2025 // check repeats within a repeat sequence
2026 self_repeat_ = false;
2027 for(size_t i = 0; i + 1 < seed_ranges_.size(); i++) {
2028 const RB_AlleleCoord& range = seed_ranges_[i];
2029 for(size_t j = i + 1; j < seed_ranges_.size(); j++) {
2030 RB_AlleleCoord& range2 = seed_ranges_[j];
2031 if(range.right <= range2.left)
2032 break;
2033 self_repeat_ = true;
2034 }
2035 }
2036 }
2037
overlap(const RB_Repeat & o,bool & contain,bool & left,size_t & seed_i,size_t & seed_j,bool debug) const2038 bool RB_Repeat::overlap(const RB_Repeat& o,
2039 bool& contain,
2040 bool& left,
2041 size_t& seed_i,
2042 size_t& seed_j,
2043 bool debug) const
2044 {
2045 contain = left = false;
2046 seed_i = seed_j = 0;
2047 size_t p = 0, p2 = 0;
2048 while(p < seed_ranges_.size() && p2 < o.seed_ranges_.size()) {
2049 const RB_AlleleCoord& range = seed_ranges_[p];
2050 const SeedExt& seed = seeds_[range.idx];
2051 Range range_ = seed.getExtendedRange(consensus_.length());
2052 RB_AlleleCoord ex_range(range_.first, range_.second, p);
2053 bool representative = (float)range.len() >= consensus_.length() * 0.95f;
2054
2055 const RB_AlleleCoord& range2 = o.seed_ranges_[p2];
2056 const SeedExt& seed2 = o.seeds_[range2.idx];
2057 Range range2_ = seed2.getExtendedRange(o.consensus().length());
2058 RB_AlleleCoord ex_range2(range2_.first, range2_.second, p2);
2059 bool representative2 = (float)range2.len() >= o.consensus_.length() * 0.95f;
2060
2061 seed_i = range.idx;
2062 seed_j = range2.idx;
2063
2064 const size_t relax = 5;
2065 if(representative && representative2) {
2066 if(ex_range.overlap_len(ex_range2) > 0) {
2067 if(ex_range.contain(ex_range2, relax)) {
2068 contain = true;
2069 left = true;
2070 } else if(ex_range2.contain(ex_range, relax)) {
2071 contain = true;
2072 left = false;
2073 } else {
2074 left = ex_range.left <= ex_range2.left;
2075 }
2076 return true;
2077 }
2078 } else if(representative) {
2079 if(range2.contain(ex_range, relax)) {
2080 contain = true;
2081 left = false;
2082 return true;
2083 }
2084 } else if(representative2) {
2085 if(range.contain(ex_range2, relax)) {
2086 contain = true;
2087 left = true;
2088 return true;
2089 }
2090 }
2091
2092 if(range.right <= range2.right) p++;
2093 if(range2.right <= range.right) p2++;
2094 }
2095 return false;
2096 }
2097
showInfo(const RepeatParameter & rp,CoordHelper & coordHelper) const2098 void RB_Repeat::showInfo(const RepeatParameter& rp,
2099 CoordHelper& coordHelper) const
2100 {
2101 cerr << "\trepeat id: " << repeat_id_ << endl;
2102 cerr << "\tnumber of alleles: " << seeds_.size() << endl;
2103 cerr << "\tconsensus length: " << consensus_.length() << endl;
2104 cerr << consensus_ << endl;
2105 for(size_t i = 0; i < seeds_.size(); i++) {
2106 const SeedExt& seed = seeds_[i];
2107 size_t ext_len = seed.getLength();
2108 if(ext_len < rp.min_repeat_len) continue;
2109 bool sense_strand = seed.pos.first < coordHelper.forward_length();
2110
2111 cerr << "\t\t" << setw(4) << i << " " << setw(4) << ext_len;
2112 cerr << " " << (sense_strand ? '+' : '-');
2113 cerr << " " << setw(10) << seed.pos.first << " " << setw(10) << seed.pos.second;
2114 cerr << " " << setw(4) << seed.consensus_pos.first << " " << setw(4) << seed.consensus_pos.second;
2115 cerr << " " << (seed.aligned ? "aligned" : "unaligned");
2116 cerr << endl;
2117 }
2118 }
2119
2120 template<typename TStr>
generateSNPs(const RepeatParameter & rp,const TStr & s,TIndexOffU grp_id)2121 void RB_Repeat::generateSNPs(const RepeatParameter& rp, const TStr& s, TIndexOffU grp_id) {
2122 EList<SeedExt>& seeds = this->seeds();
2123
2124 for(size_t i = 0; i < seeds.size(); i++) {
2125 SeedExt& seed = seeds[i];
2126
2127 if(!seed.aligned) {
2128 continue;
2129 }
2130
2131 if(seed.getLength() < rp.min_repeat_len) {
2132 continue;
2133 }
2134 assert_eq(seed.getLength(), seed.pos.second - seed.pos.first);
2135 seed.generateSNPs(s, consensus(), snps());
2136 }
2137 if(snps().size() > 0) {
2138 sort(snps_.begin(), snps().end(), SeedSNP::cmpSeedSNPByPos);
2139 }
2140 }
2141
saveSNPs(ofstream & fp,TIndexOffU grp_id,TIndexOffU & snp_id_base)2142 void RB_Repeat::saveSNPs(ofstream &fp, TIndexOffU grp_id, TIndexOffU& snp_id_base)
2143 {
2144 string chr_name = "rep" + to_string(repeat_id_);
2145
2146 for(size_t j = 0; j < snps_.size(); j++) {
2147 SeedSNP& snp = *snps_[j];
2148 // assign SNPid
2149 snp.id = (snp_id_base++);
2150
2151 fp << "rps" << snp.id;
2152 fp << "\t";
2153
2154 if(snp.type == EDIT_TYPE_MM) {
2155 fp << "single";
2156 } else if(snp.type == EDIT_TYPE_READ_GAP) {
2157 fp << "deletion";
2158 } else if(snp.type == EDIT_TYPE_REF_GAP) {
2159 fp << "insertion";
2160 } else {
2161 assert(false);
2162 }
2163 fp << "\t";
2164
2165 fp << chr_name;
2166 fp << "\t";
2167
2168 fp << snp.pos;
2169 fp << "\t";
2170 if(snp.type == EDIT_TYPE_MM || snp.type == EDIT_TYPE_REF_GAP) {
2171 fp << snp.base;
2172 } else if(snp.type == EDIT_TYPE_READ_GAP) {
2173 fp << snp.len;
2174 } else {
2175 assert(false);
2176 }
2177 fp << endl;
2178 }
2179 }
2180
mergeable(const RB_Repeat & o) const2181 float RB_RepeatExt::mergeable(const RB_Repeat& o) const
2182 {
2183 const EList<RB_AlleleCoord>& ranges = seed_ranges();
2184 const EList<RB_AlleleCoord>& ranges2 = o.seed_ranges();
2185 size_t num_overlap_bp = 0;
2186 size_t p = 0, p2 = 0;
2187 while(p < ranges.size() && p2 < ranges2.size()) {
2188 const RB_AlleleCoord& range = ranges[p];
2189 const RB_AlleleCoord& range2 = ranges2[p2];
2190 TIndexOffU overlap = range.overlap_len(range2);
2191 num_overlap_bp += overlap;
2192 if(range.right <= range2.right) p++;
2193 else p2++;
2194 }
2195 size_t num_total_bp = 0, num_total_bp2 = 0;
2196 for(size_t r = 0; r < ranges.size(); r++) num_total_bp += (ranges[r].right - ranges[r].left);
2197 for(size_t r = 0; r < ranges2.size(); r++) num_total_bp2 += (ranges2[r].right - ranges2[r].left);
2198 float portion = float(num_overlap_bp) / float(min(num_total_bp, num_total_bp2));
2199 return portion;
2200 }
2201
get_next_range(const EList<int> & offsets,size_t i,Range & r,float & avg)2202 inline void get_next_range(const EList<int>& offsets, size_t i, Range& r, float& avg)
2203 {
2204 r.first = i;
2205 for(; r.first < offsets.size() && offsets[r.first] < 0; r.first++);
2206 r.second = r.first + 1;
2207 avg = 0.0f;
2208 if(r.first < offsets.size()) {
2209 float diff = (float)offsets[r.first] - (float)r.first;
2210 avg += diff;
2211 }
2212 for(; r.second < offsets.size() &&
2213 offsets[r.second] >= 0 &&
2214 (r.second == 0 || offsets[r.second] >= offsets[r.second - 1]);
2215 r.second++) {
2216 float diff = (float)offsets[r.second] - (float)r.second;
2217 avg += diff;
2218 }
2219 avg /= (float)(r.second - r.first);
2220 return;
2221 }
2222
2223 template<typename TStr>
align(const RepeatParameter & rp,const TStr & ref,const string & s,const EList<pair<size_t,size_t>> & s_kmer_table,const string & s2,EList<int> & offsets,size_t k,SeedExt & seed,int consensus_approx_left,int consensus_approx_right,size_t left,size_t right,bool debug)2224 bool RB_RepeatExt::align(const RepeatParameter& rp,
2225 const TStr& ref,
2226 const string& s,
2227 const EList<pair<size_t, size_t> >& s_kmer_table,
2228 const string& s2,
2229 EList<int>& offsets,
2230 size_t k,
2231 SeedExt& seed,
2232 int consensus_approx_left,
2233 int consensus_approx_right,
2234 size_t left,
2235 size_t right,
2236 bool debug)
2237 {
2238 RB_Repeat::seed_merge_tried++;
2239
2240 seed.reset();
2241 seed.pos.first = left;
2242 seed.pos.second = right;
2243 seed.orig_pos.first = seed.pos.first;
2244 seed.orig_pos.second = seed.orig_pos.first;
2245 seed.consensus_pos.first = 0;
2246 seed.consensus_pos.second = consensus_.length();
2247 seed.aligned = false;
2248
2249 {
2250 const int query_len = right - left;
2251 const int consensus_len = consensus_approx_right - consensus_approx_left;
2252 const int abs_gap_len = max<int>(abs(consensus_len - query_len), 5);
2253
2254 offsets.resize(s2.length());
2255 offsets.fill(-1);
2256 pair<size_t, size_t> kmer(0, 0);
2257 for(size_t i = 0; i + k <= s2.length(); i++) {
2258 if(i == 0) {
2259 kmer.first = extract_kmer(s2, i, k);
2260 } else {
2261 kmer.first = next_kmer(kmer.first, s2[i+k-1], k);
2262 }
2263 size_t lb = s_kmer_table.bsearchLoBound(kmer);
2264 while(lb < s_kmer_table.size() && kmer.first == s_kmer_table[lb].first) {
2265 int expected = (int)i + consensus_approx_left;
2266 int real = (int)s_kmer_table[lb].second;
2267 int abs_diff = abs(expected - real);
2268 if(abs_diff <= abs_gap_len * 2 || debug) {
2269 if(offsets[i] == -1) {
2270 offsets[i] = (int)s_kmer_table[lb].second;
2271 } else if(offsets[i] >= 0) {
2272 offsets[i] = -2;
2273 } else {
2274 assert_lt(offsets[i], -1);
2275 offsets[i] -= 1;
2276 }
2277 }
2278 lb++;
2279 }
2280 if(offsets[i] > 0 && i + k == s2.length()) {
2281 for(size_t j = i + 1; j < s2.length(); j++) {
2282 offsets[j] = offsets[j-1] + 1;
2283 }
2284 }
2285 }
2286 }
2287
2288 if(debug) {
2289 cerr << "initial offsets" << endl;
2290 for(size_t j = 0; j < offsets.size(); j++) {
2291 cout << j << ": " << offsets[j] << " " << s2[j] << ": " << (offsets[j] < 0 ? ' ' : s[offsets[j]]) << endl;
2292 }
2293 }
2294
2295 // remove inconsistent positions
2296 Range range; float range_avg;
2297 get_next_range(offsets, 0, range, range_avg);
2298 while(range.second < offsets.size()) {
2299 Range range2; float range_avg2;
2300 get_next_range(offsets, range.second, range2, range_avg2);
2301 if(range2.first >= offsets.size())
2302 break;
2303
2304 assert_leq(range.second, range2.first);
2305
2306 float abs_diff = abs(range_avg - range_avg2);
2307 if(offsets[range.second - 1] > offsets[range2.first] ||
2308 (abs_diff > 10.0f && (range.second - range.first < 5 || range2.second - range2.first < 5)) ||
2309 abs_diff > 20.0f) {
2310 if(range.second - range.first < range2.second - range2.first) {
2311 for(size_t i = range.first; i < range.second; i++) {
2312 offsets[i] = -1;
2313 }
2314 range = range2;
2315 range_avg = range_avg2;
2316 } else {
2317 for(size_t i = range2.first; i < range2.second; i++) {
2318 offsets[i] = -1;
2319 }
2320 }
2321 } else {
2322 range = range2;
2323 range_avg = range_avg2;
2324 }
2325 }
2326
2327 bool weighted_avg_inited = false;
2328 float weighted_avg = -1.0f;
2329 for(size_t i = 0; i < offsets.size(); i++) {
2330 if(offsets[i] < 0)
2331 continue;
2332
2333 float diff = (float)offsets[i] - (float)i;
2334 if(weighted_avg_inited) {
2335 if(abs(diff - weighted_avg) > 20.0f) {
2336 offsets[i] = -1;
2337 continue;
2338 }
2339 }
2340
2341 if(weighted_avg < 0.0f) {
2342 weighted_avg = diff;
2343 weighted_avg_inited = true;
2344 } else {
2345 weighted_avg = 0.8f * weighted_avg + 0.2f * diff;
2346 }
2347 }
2348
2349 if(debug) {
2350 cerr << "after filtering" << endl;
2351 for(size_t j = 0; j < offsets.size(); j++) {
2352 cout << j << ": " << offsets[j] << " " << s2[j] << ": " << (offsets[j] < 0 ? ' ' : s[offsets[j]]) << endl;
2353 }
2354 }
2355
2356 int i = 0;
2357 while(i < offsets.size()) {
2358 // skip non-negative offsets
2359 for(; i < offsets.size() && offsets[i] >= 0; i++);
2360 if(i >= offsets.size()) break;
2361 int j = i;
2362 for(; j < offsets.size(); j++) {
2363 if(offsets[j] >= 0)
2364 break;
2365 }
2366 assert(i >= offsets.size() || offsets[i] < 0);
2367 assert(j >= offsets.size() || offsets[j] >= 0);
2368 if(i > 0 && j < offsets.size()) {
2369 i -= 1;
2370 int left = offsets[i], right = offsets[j];
2371 assert_geq(left, 0);
2372 if(left > right) return false;
2373 assert_leq(left, right);
2374 int ref_len = right - left + 1;
2375 int query_len = j - i + 1;
2376 if(query_len == ref_len) { // match
2377 for(size_t i2 = i + 1; i2 < j; i2++)
2378 offsets[i2] = offsets[i2-1] + 1;
2379 } else { // deletion or insertion
2380 bool del = query_len < ref_len;
2381 size_t gap_len = del ? ref_len - query_len : query_len - ref_len;
2382 size_t max_len = max(ref_len, query_len);
2383 const size_t max_mm = max_len / 25 + 1;
2384 const size_t very_max_mm = max<size_t>(max_len / 2, max_mm);
2385 ca_s_ = s.substr(left, max_len);
2386 ca_s2_ = s2.substr(i, max_len);
2387
2388 size_t gap_pos = 0, mm = very_max_mm + 1;
2389 find_gap_pos(ca_s_,
2390 ca_s2_,
2391 ca_ed_,
2392 ca_ed2_,
2393 del,
2394 gap_len,
2395 very_max_mm,
2396 gap_pos,
2397 mm,
2398 debug);
2399
2400 if(mm > very_max_mm) {
2401 return false;
2402 }
2403
2404 assert_lt(gap_pos, query_len);
2405 if(del) {
2406 for(size_t i2 = i + 1; i2 < j; i2++) {
2407 if(i2 - i == gap_pos) {
2408 offsets[i2] = offsets[i2-1] + gap_len;
2409 } else {
2410 offsets[i2] = offsets[i2-1] + 1;
2411 }
2412 }
2413 } else {
2414 for(size_t i2 = i + 1; i2 < j; i2++) {
2415 if(i2 - i >= gap_pos && i2 - i < gap_pos + gap_len) {
2416 offsets[i2] = offsets[i2-1];
2417 } else {
2418 offsets[i2] = offsets[i2-1] + 1;
2419 }
2420 }
2421 }
2422 }
2423 }
2424
2425 i = j;
2426 }
2427
2428 if(debug) {
2429 cerr << "final offsets" << endl;
2430 for(size_t j = 0; j < offsets.size(); j++) {
2431 cout << j << ": " << offsets[j] << " " << s2[j] << ": " << (offsets[j] < 0 ? ' ' : s[offsets[j]]) << endl;
2432 }
2433 }
2434
2435 #ifndef NDEBUG
2436 for(size_t i = 1; i < offsets.size(); i++) {
2437 if(offsets[i-1] < 0 || offsets[i] < 0) continue;
2438 assert_leq(offsets[i-1], offsets[i]);
2439 }
2440 #endif
2441
2442 size_t b, e;
2443 for(b = 0; b < offsets.size() && offsets[b] < 0; b++);
2444 if(b >= offsets.size()) return false;
2445 assert_lt((size_t)b, offsets.size());
2446 for(e = offsets.size() - 1; e > b && offsets[e] < 0; e--);
2447 if(b == e) return false;
2448 for(size_t p = b; p <= e; p++) {
2449 if(offsets[p] < 0) return false;
2450 }
2451
2452 // try to fill the ends
2453 if(b > 0) {
2454 int mm = 0, pb = (int)b;
2455 for(int i = pb - 1; i >= 0; i--) {
2456 assert_geq(offsets[i+1], 0);
2457 if(offsets[i+1] == 0) break;
2458 if(s2[i] != s[offsets[i+1] - 1])
2459 mm++;
2460 if(pb - i < 25 * (mm - 1))
2461 break;
2462 offsets[i] = offsets[i+1] - 1;
2463 b = (size_t)i;
2464 }
2465 }
2466 if(e + 1 < offsets.size()) {
2467 int mm = 0, prev_end = (int)e;
2468 for(int i = prev_end + 1; i < offsets.size(); i++) {
2469 assert_geq(offsets[i-1], 0);
2470 if(offsets[i-1] + 1 >= s.length()) break;
2471 if(s2[i] != s[offsets[i-1] + 1])
2472 mm++;
2473 if(i - prev_end < 25 * (mm - 1))
2474 break;
2475 offsets[i] = offsets[i-1] + 1;
2476 e = (size_t)i;
2477 }
2478 }
2479
2480 assert_geq(seed.pos.first, (TIndexOffU)b);
2481 seed.pos.first += (TIndexOffU)b;
2482 seed.pos.second = seed.pos.first + e - b + 1;
2483 seed.orig_pos.first = seed.pos.first;
2484 seed.orig_pos.second = seed.pos.first;
2485 seed.consensus_pos.first = offsets[b];
2486 seed.consensus_pos.second = offsets[e] + 1;
2487 seed.aligned = true;
2488
2489 for(int p = b; p < e; p++) {
2490 assert_geq(offsets[p], 0);
2491 assert_leq(offsets[p], offsets[p+1]);
2492 if(offsets[p] + 1 == offsets[p+1]) { // match
2493 continue;
2494 } else if(offsets[p] + 1 < offsets[p+1]) { // deletion
2495 seed.right_gaps.expand();
2496 seed.right_gaps.back().first = p + 1 - b;
2497 seed.right_gaps.back().second = offsets[p+1] - offsets[p] - 1;
2498 } else {
2499 assert_eq(offsets[p], offsets[p+1]);
2500 int p2 = p + 1;
2501 for(; offsets[p2] == offsets[p2+1]; p2++);
2502 seed.right_gaps.expand();
2503 seed.right_gaps.back().first = p + 1 - b;
2504 seed.right_gaps.back().second = (int)p - (int)p2;
2505 p = p2;
2506 }
2507 }
2508
2509 if(debug) {
2510 string consensus_str = consensus_.substr(seed.consensus_pos.first, seed.consensus_pos.second - seed.consensus_pos.first);
2511 string seed_str;
2512 seed.getExtendedSeedSequence(ref, seed_str);
2513 assert_eq(consensus_str.length(), seed_str.length());
2514 cerr << "consensus: " << consensus_str << endl;
2515 cerr << "seed : " << seed_str << endl;
2516 }
2517
2518 RB_Repeat::seed_merged++;
2519
2520 return true;
2521 }
2522
isSelfRepeat(const RepeatParameter & rp,const string & s,const EList<pair<size_t,size_t>> & s_kmer_table,EList<int> & offsets,size_t k,bool debug)2523 bool RB_RepeatExt::isSelfRepeat(const RepeatParameter& rp,
2524 const string& s,
2525 const EList<pair<size_t, size_t> >& s_kmer_table,
2526 EList<int>& offsets,
2527 size_t k,
2528 bool debug)
2529 {
2530 offsets.resize(s.length());
2531 offsets.fill(-1);
2532 pair<size_t, size_t> kmer(0, 0);
2533 for(size_t i = 0; i + k <= s.length(); i++) {
2534 if(i == 0) {
2535 kmer.first = extract_kmer(s, i, k);
2536 } else {
2537 kmer.first = next_kmer(kmer.first, s[i+k-1], k);
2538 }
2539 size_t lb = s_kmer_table.bsearchLoBound(kmer);
2540 while(lb < s_kmer_table.size() && kmer.first == s_kmer_table[lb].first) {
2541 if(offsets[i] == -1) {
2542 offsets[i] = (int)s_kmer_table[lb].second;
2543 } else if(offsets[i] >= 0) {
2544 offsets[i] = -2;
2545 } else {
2546 assert_lt(offsets[i], -1);
2547 offsets[i] -= 1;
2548 }
2549 lb++;
2550 }
2551 if(offsets[i] > 0 && i + k == s.length()) {
2552 for(size_t j = i + 1; j < s.length(); j++) {
2553 offsets[j] = offsets[j-1] + 1;
2554 }
2555 }
2556 }
2557
2558 if(debug) {
2559 cerr << "offsets" << endl;
2560 for(size_t j = 0; j < offsets.size(); j++) {
2561 cout << j << ": " << offsets[j] << " " << s[j] << ": " << (offsets[j] < 0 ? ' ' : s[offsets[j]]) << endl;
2562 }
2563 }
2564
2565 const size_t min_repeat_size = 100;
2566 size_t repeat_count = 0;
2567 for(size_t i = 0; i + min_repeat_size < offsets.size(); i++) {
2568 if(offsets[i] >= -1)
2569 continue;
2570 size_t j = i + 1;
2571 for(; j < offsets.size() && offsets[j] <= -2; j++);
2572 if(j - i >= min_repeat_size)
2573 repeat_count++;
2574 i = j;
2575 }
2576
2577 return repeat_count >= 2;
2578 }
2579
2580 template<typename TStr>
merge(const RepeatParameter & rp,const TStr & s,RB_SWAligner & swalginer,const RB_RepeatExt & o,bool contain,size_t seed_i,size_t seed_j,bool debug)2581 bool RB_RepeatExt::merge(const RepeatParameter& rp,
2582 const TStr& s,
2583 RB_SWAligner& swalginer,
2584 const RB_RepeatExt& o,
2585 bool contain,
2586 size_t seed_i,
2587 size_t seed_j,
2588 bool debug)
2589 {
2590 // construct a new consensus sequence
2591 string prev_consensus = consensus_;
2592 size_t consensus_add_len = 0;
2593 {
2594 assert_lt(seed_i, seeds_.size());
2595 assert_lt(seed_j, o.seeds_.size());
2596 const SeedExt& seed = seeds_[seed_i];
2597 const SeedExt& oseed = o.seeds_[seed_j];
2598
2599 Range range = seed.getExtendedRange(consensus_.length());
2600 Range orange = oseed.getExtendedRange(o.consensus_.length());
2601 assert_leq(range.first, orange.first + 10);
2602
2603 consensus_add_len = (int)orange.first - (int)range.first;
2604 if(!contain) {
2605 if(range.second <= orange.first) {
2606 cerr << "something wrong: " << range.first << "-" << range.second << " ";
2607 cerr << orange.first << "-" << orange.second << " " << o.consensus_.length() << endl;
2608 assert(false);
2609 }
2610 if(range.second > orange.first &&
2611 range.second - orange.first < o.consensus_.length()) {
2612 consensus_ += o.consensus_.substr(range.second - orange.first);
2613 }
2614 }
2615 }
2616
2617 EList<pair<size_t, size_t> > merge_list;
2618 merge_list.reserveExact(o.seed_ranges_.size());
2619 size_t p = 0, p2 = 0;
2620 const size_t relax = 5;
2621 while(p < seed_ranges_.size() && p2 < o.seed_ranges_.size()) {
2622 const RB_AlleleCoord& range = seed_ranges_[p];
2623 assert_lt(range.idx, seeds_.size());
2624 const RB_AlleleCoord& range2 = o.seed_ranges_[p2];
2625 assert_lt(range2.idx, o.seeds_.size());
2626 if(range.contain(range2, relax)) {
2627 merge_list.expand();
2628 merge_list.back().first = p;
2629 merge_list.back().second = p2;
2630 if(debug) {
2631 cerr << p << ":" << range.left << "-" << range.right << " > ";
2632 cerr << p2 << ":" << range2.left << "-" << range2.right << endl;
2633 }
2634 } else if(range2.contain(range, relax)) {
2635 merge_list.expand();
2636 merge_list.back().first = p;
2637 merge_list.back().second = p2;
2638 if(debug) {
2639 cerr << p << ":" << range.left << "-" << range.right << " < ";
2640 cerr << p2 << ":" << range2.left << "-" << range2.right << endl;
2641 }
2642 } else {
2643 TIndexOffU overlap_len = range.overlap_len(range2);
2644 bool stored = !merge_list.empty() && merge_list.back().first == p;
2645 bool stored2 = !merge_list.empty() && merge_list.back().second == p2;
2646 if(overlap_len > 0) {
2647 if(!stored && !stored2) {
2648 merge_list.expand();
2649 merge_list.back().first = p;
2650 merge_list.back().second = p2;
2651
2652 if(debug) {
2653 cerr << p << ":" << range.left << "-" << range.right << " ol ";
2654 cerr << p2 << ":" << range2.left << "-" << range2.right << endl;
2655 }
2656 }
2657 } else {
2658 if(range2.right <= range.left) {
2659 if(!stored2) {
2660 merge_list.expand();
2661 merge_list.back().first = numeric_limits<size_t>::max();
2662 merge_list.back().second = p2;
2663
2664 if(debug) {
2665 cerr << p << ":" << range.left << "-" << range.right << " <> ";
2666 cerr << p2 << ":" << range2.left << "-" << range2.right << endl;
2667 }
2668 }
2669 } else {
2670 assert_leq(range.right, range2.left);
2671 if(!stored) {
2672 merge_list.expand();
2673 merge_list.back().first = p;
2674 merge_list.back().second = numeric_limits<size_t>::max();
2675
2676 if(debug) {
2677 cerr << p << ":" << range.left << "-" << range.right << " <> ";
2678 cerr << p2 << ":" << range2.left << "-" << range2.right << endl;
2679 }
2680 }
2681 }
2682 }
2683 }
2684 if(range.right <= range2.right) p++;
2685 if(range2.right <= range.right) p2++;
2686 }
2687 assert(p == seeds_.size() || p2 == o.seeds_.size());
2688
2689 while(p < seed_ranges_.size()) {
2690 bool stored = !merge_list.empty() && merge_list.back().first == p;
2691 if(!stored) {
2692 const RB_AlleleCoord& range = seed_ranges_[p];
2693 merge_list.expand();
2694 merge_list.back().first = p;
2695 merge_list.back().second = numeric_limits<size_t>::max();
2696
2697 if(debug) {
2698 cerr << p << ":" << range.left << "-" << range.right << " : <> " << endl;
2699 }
2700 }
2701
2702 p++;
2703 }
2704
2705 while(p2 < o.seed_ranges_.size()) {
2706 bool stored2 = !merge_list.empty() && merge_list.back().second == p2;
2707 if(!stored2) {
2708 const RB_AlleleCoord& range2 = o.seed_ranges_[p2];
2709 merge_list.expand();
2710 merge_list.back().first = numeric_limits<size_t>::max();
2711 merge_list.back().second = p2;
2712
2713 if(debug) {
2714 cerr << ": <> " << p2 << ":" << range2.left << "-" << range2.right << endl;
2715 }
2716 }
2717
2718 p2++;
2719 }
2720
2721 assert(!merge_list.empty());
2722
2723 if(debug) {
2724 for(size_t i = 0; i < merge_list.size(); i++) {
2725 cerr << "merge list:" << endl;
2726 cerr << "\t" << (merge_list[i].first < seed_ranges_.size() ? (int)merge_list[i].first : -1);
2727 cerr << " " << (merge_list[i].second < o.seed_ranges_.size() ? (int)merge_list[i].second : -1) << endl;
2728 }
2729 }
2730
2731 const size_t kmer_len = 12;
2732 EList<pair<size_t, size_t> > kmer_table;
2733 build_kmer_table(consensus_, kmer_table, kmer_len);
2734 EList<int> offsets;
2735
2736 bool self_repeat = isSelfRepeat(rp,
2737 consensus_,
2738 kmer_table,
2739 offsets,
2740 kmer_len,
2741 debug);
2742 if(self_repeat) {
2743 consensus_ = prev_consensus;
2744 return false;
2745 }
2746
2747 string seq;
2748 for(size_t i = 0; i < merge_list.size(); i++) {
2749 size_t seed_id = (merge_list[i].first < seed_ranges_.size() ? seed_ranges_[merge_list[i].first].idx : merge_list[i].first);
2750 size_t oseed_id = (merge_list[i].second < o.seed_ranges_.size() ? o.seed_ranges_[merge_list[i].second].idx : merge_list[i].second);
2751
2752 if(seed_id < seeds_.size()) {
2753 if(oseed_id >= o.seeds_.size())
2754 continue;
2755 else if(seed_ranges_[merge_list[i].first].contain(o.seed_ranges_[merge_list[i].second]))
2756 continue;
2757 }
2758
2759 const SeedExt* seed = (seed_id < seeds_.size() ? &seeds_[seed_id] : NULL);
2760 const SeedExt* oseed = (oseed_id < o.seeds_.size() ? &o.seeds_[oseed_id] : NULL);
2761 assert(seed != NULL || oseed != NULL);
2762
2763 size_t left = (seed != NULL ? seed->pos.first : numeric_limits<size_t>::max());
2764 size_t right = (seed != NULL ? seed->pos.second : 0);
2765 int consensus_approx_left = (seed != NULL ? seed->consensus_pos.first : numeric_limits<int>::max());
2766 int consensus_approx_right = (seed != NULL ? seed->consensus_pos.second : 0);
2767 if(oseed != NULL) {
2768 if(oseed->pos.first < left) {
2769 left = oseed->pos.first;
2770 }
2771 if(oseed->pos.second > right) {
2772 right = oseed->pos.second;
2773 }
2774
2775 int oconsensus_approx_left = oseed->consensus_pos.first + consensus_add_len;
2776 if(oconsensus_approx_left < consensus_approx_left) {
2777 consensus_approx_left = oconsensus_approx_left;
2778 }
2779 int oconsensus_approx_right = oseed->consensus_pos.second + consensus_add_len;
2780 if(oconsensus_approx_right > consensus_approx_right) {
2781 consensus_approx_right = oconsensus_approx_right;
2782 }
2783 }
2784
2785 getString(s, left, right - left, seq);
2786 if(seed_id >= seeds_.size()) {
2787 seed_id = seeds_.size();
2788 seeds_.expand();
2789 }
2790
2791 /* bool succ = */ align(rp,
2792 s,
2793 consensus_,
2794 kmer_table,
2795 seq,
2796 offsets,
2797 kmer_len,
2798 seeds_[seed_id],
2799 consensus_approx_left,
2800 consensus_approx_right,
2801 left,
2802 right,
2803 debug && right - left == 1080);
2804 }
2805
2806 while(true) {
2807 internal_update();
2808
2809 size_t remove_count = 0;
2810 for(size_t i = 0; i + 1 < seed_ranges_.size(); i++) {
2811 RB_AlleleCoord& range = seed_ranges_[i];
2812 if(range.left == numeric_limits<size_t>::max())
2813 break;
2814 for(size_t j = i + 1; j < seed_ranges_.size(); j++) {
2815 RB_AlleleCoord& range2 = seed_ranges_[j];
2816 if(range2.left == numeric_limits<size_t>::max())
2817 break;
2818 if(range.right <= range2.left)
2819 break;
2820
2821 getString(s, range.left, range2.right - range.left, seq);
2822
2823 /* bool succ = */ align(rp,
2824 s,
2825 consensus_,
2826 kmer_table,
2827 seq,
2828 offsets,
2829 kmer_len,
2830 seeds_[range.idx],
2831 seeds_[range.idx].consensus_pos.first,
2832 seeds_[range2.idx].consensus_pos.second,
2833 range.left,
2834 range2.right,
2835 debug && range.left == 692422);
2836
2837 range.left = seeds_[range.idx].pos.first;
2838 range.right = seeds_[range.idx].pos.second;
2839 seeds_[range2.idx].reset();
2840 range2.left = numeric_limits<size_t>::max();
2841 remove_count++;
2842 }
2843 }
2844
2845 if(remove_count <= 0) break;
2846
2847 sort(seeds_.begin(), seeds_.end(), seedCmp);
2848 seeds_.resize(seeds_.size() - remove_count);
2849 }
2850
2851 return true;
2852 }
2853
2854 #define DMAX std::numeric_limits<double>::max()
2855
RB_SWAligner()2856 RB_SWAligner::RB_SWAligner()
2857 {
2858 rnd_.init(0);
2859 }
2860
~RB_SWAligner()2861 RB_SWAligner::~RB_SWAligner()
2862 {
2863 if(sc_) {
2864 delete sc_;
2865 }
2866 }
2867
init_dyn(const RepeatParameter & rp)2868 void RB_SWAligner::init_dyn(const RepeatParameter& rp)
2869 {
2870 const int MM_PEN = 3;
2871 // const int MM_PEN = 6;
2872 const int GAP_PEN_LIN = 2;
2873 // const int GAP_PEN_LIN = (((MM_PEN) * rpt_edit_ + 1) * 1.0);
2874 const int GAP_PEN_CON = 4;
2875 // const int GAP_PEN_CON = (((MM_PEN) * rpt_edit_ + 1) * 1.0);
2876 const int MAX_PEN = MAX_I16;
2877
2878 scoreMin_.init(SIMPLE_FUNC_LINEAR, rp.max_edit * MM_PEN * -1.0, 0.0);
2879 nCeil_.init(SIMPLE_FUNC_LINEAR, 0.0, 0.0);
2880
2881 penCanIntronLen_.init(SIMPLE_FUNC_LOG, -8, 1);
2882 penNoncanIntronLen_.init(SIMPLE_FUNC_LOG, -8, 1);
2883
2884 sc_ = new Scoring(
2885 DEFAULT_MATCH_BONUS, // constant reward for match
2886 DEFAULT_MM_PENALTY_TYPE, // how to penalize mismatches
2887 MM_PEN, // max mm penalty
2888 MM_PEN, // min mm penalty
2889 MAX_PEN, // max sc penalty
2890 MAX_PEN, // min sc penalty
2891 scoreMin_, // min score as function of read len
2892 nCeil_, // max # Ns as function of read len
2893 DEFAULT_N_PENALTY_TYPE, // how to penalize Ns in the read
2894 DEFAULT_N_PENALTY, // constant if N pelanty is a constant
2895 DEFAULT_N_CAT_PAIR, // whether to concat mates before N filtering
2896
2897
2898 GAP_PEN_CON, // constant coeff for read gap cost
2899 GAP_PEN_CON, // constant coeff for ref gap cost
2900 GAP_PEN_LIN, // linear coeff for read gap cost
2901 GAP_PEN_LIN, // linear coeff for ref gap cost
2902 1 /* gGapBarrier */ // # rows at top/bot only entered diagonally
2903 );
2904 }
2905
makePadString(const string & ref,const string & read,string & pad,size_t len)2906 void RB_SWAligner::makePadString(const string& ref,
2907 const string& read,
2908 string& pad,
2909 size_t len)
2910 {
2911 pad.resize(len);
2912
2913 for(size_t i = 0; i < len; i++) {
2914 // shift A->C, C->G, G->T, T->A
2915 pad[i] = "CGTA"[asc2dna[ref[i]]];
2916
2917 if(read[i] == pad[i]) {
2918 // shift
2919 pad[i] = "CGTA"[asc2dna[pad[i]]];
2920 }
2921 }
2922
2923 int head_len = len / 2;
2924 size_t pad_start = len - head_len;
2925
2926 for(size_t i = 0; i < head_len; i++) {
2927 if(read[i] == pad[pad_start + i]) {
2928 // shift
2929 pad[pad_start + i] = "CGTA"[asc2dna[pad[pad_start + i]]];
2930 }
2931 }
2932 }
2933
alignStrings(const string & ref,const string & read,EList<Edit> & edits,Coord & coord)2934 int RB_SWAligner::alignStrings(const string &ref,
2935 const string &read,
2936 EList<Edit>& edits,
2937 Coord& coord)
2938 {
2939 // Prepare Strings
2940
2941 // Read -> BTDnaString
2942 // Ref -> bit-encoded string
2943
2944 //SwAligner swa;
2945
2946 BTDnaString btread;
2947 BTString btqual;
2948 BTString btref;
2949 BTString btref2;
2950
2951 BTDnaString btreadrc;
2952 BTString btqualrc;
2953
2954
2955 string qual = "";
2956 for(size_t i = 0; i < read.length(); i++) {
2957 qual.push_back('I');
2958 }
2959
2960 #if 0
2961 cerr << "REF : " << ref << endl;
2962 cerr << "READ: " << read << endl;
2963 cerr << "QUAL: " << qual << endl;
2964 #endif
2965
2966 btread.install(read.c_str(), true);
2967 btreadrc = btread;
2968 btreadrc.reverseComp();
2969
2970 btqual.install(qual.c_str());
2971 btqualrc = btqual;
2972
2973 btref.install(ref.c_str());
2974
2975 TAlScore min_score = sc_->scoreMin.f<TAlScore >((double)btread.length());
2976
2977 btref2 = btref;
2978
2979 size_t nceil = 0;
2980 // size_t nrow = btread.length();
2981
2982 // Convert reference string to mask
2983 for(size_t i = 0; i < btref2.length(); i++) {
2984 if(toupper(btref2[i]) == 'N') {
2985 btref2.set(16, i);
2986 } else {
2987 int num = 0;
2988 int alts[] = {4, 4, 4, 4};
2989 decodeNuc(toupper(btref2[i]), num, alts);
2990 assert_leq(num, 4);
2991 assert_gt(num, 0);
2992 btref2.set(0, i);
2993 for(int j = 0; j < num; j++) {
2994 btref2.set(btref2[i] | (1 << alts[j]), i);
2995 }
2996 }
2997 }
2998
2999
3000 bool fw = true;
3001 uint32_t refidx = 0;
3002
3003 swa.initRead(
3004 btread, // read sequence
3005 btreadrc,
3006 btqual, // read qualities
3007 btqualrc,
3008 0, // offset of first character within 'read' to consider
3009 btread.length(), // offset of last char (exclusive) in 'read' to consider
3010 *sc_); // local-alignment score floor
3011
3012 DynProgFramer dpframe(false);
3013 size_t readgaps = 0;
3014 size_t refgaps = 0;
3015 size_t maxhalf = 0;
3016
3017 DPRect rect;
3018 dpframe.frameSeedExtensionRect(
3019 0, // ref offset implied by seed hit assuming no gaps
3020 btread.length(), // length of read sequence used in DP table
3021 btref2.length(), // length of reference
3022 readgaps, // max # of read gaps permitted in opp mate alignment
3023 refgaps, // max # of ref gaps permitted in opp mate alignment
3024 (size_t)nceil, // # Ns permitted
3025 maxhalf, // max width in either direction
3026 rect); // DP Rectangle
3027
3028 assert(rect.repOk());
3029
3030 size_t cminlen = 2000, cpow2 = 4;
3031
3032 swa.initRef(
3033 fw, // whether to align forward or revcomp read
3034 refidx, // reference ID
3035 rect, // DP rectangle
3036 btref2.wbuf(), // reference strings
3037 0, // offset of first reference char to align to
3038 btref2.length(), // offset of last reference char to align to
3039 btref2.length(), // length of reference sequence
3040 *sc_, // scoring scheme
3041 min_score, // minimum score
3042 true, // use 8-bit SSE if positions
3043 cminlen, // minimum length for using checkpointing scheme
3044 cpow2, // interval b/t checkpointed diags; 1 << this
3045 false, // triangular mini-fills?
3046 false // is this a seed extension?
3047 );
3048
3049
3050 TAlScore best = std::numeric_limits<TAlScore>::min();
3051 bool found = swa.align(rnd_, best);
3052
3053 #ifdef DEBUGLOG
3054 cerr << "found: " << found << "\t" << best << "\t" << "minsc: " << min_score << endl;
3055 #endif
3056
3057 if (found) {
3058 #ifdef DEBUGLOG
3059 //cerr << "CP " << "found: " << found << "\t" << best << "\t" << "minsc: " << min_score << endl;
3060 cerr << "REF : " << ref << endl;
3061 cerr << "READ: " << read << endl;
3062 #endif
3063
3064 SwResult res;
3065 int max_match_len = 0;
3066 res.reset();
3067 res.alres.init_raw_edits(&rawEdits_);
3068
3069 found = swa.nextAlignment(res, best, rnd_);
3070 if (found) {
3071 edits = res.alres.ned();
3072 //const TRefOff ref_off = res.alres.refoff();
3073 //const Coord& coord = res.alres.refcoord();
3074 coord = res.alres.refcoord();
3075 //assert_geq(genomeHit._joinedOff + coord.off(), genomeHit.refoff());
3076
3077 #ifdef DEBUGLOG
3078 cerr << "num edits: " << edits.size() << endl;
3079 cerr << "coord: " << coord.off();
3080 cerr << ", " << coord.ref();
3081 cerr << ", " << coord.orient();
3082 cerr << ", " << coord.joinedOff();
3083 cerr << endl;
3084 Edit::print(cerr, edits); cerr << endl;
3085 Edit::printQAlign(cerr, btread, edits);
3086 #endif
3087
3088 max_match_len = getMaxMatchLen(edits, btread.length());
3089 #ifdef DEBUGLOG
3090 cerr << "max match length: " << max_match_len << endl;
3091 #endif
3092 }
3093 #ifdef DEBUGLOG
3094 cerr << "nextAlignment: " << found << endl;
3095 cerr << "-------------------------" << endl;
3096 #endif
3097 }
3098
3099 return 0;
3100 }
3101
doTest(const RepeatParameter & rp,const string & refstr,const string & readstr)3102 void RB_SWAligner::doTest(const RepeatParameter& rp,
3103 const string& refstr,
3104 const string& readstr)
3105 {
3106 init_dyn(rp);
3107
3108 doTestCase1(refstr,
3109 readstr,
3110 rp.max_edit);
3111 }
3112
doTestCase1(const string & refstr,const string & readstr,TIndexOffU rpt_edit)3113 void RB_SWAligner::doTestCase1(const string& refstr,
3114 const string& readstr,
3115 TIndexOffU rpt_edit)
3116 {
3117 cerr << "doTestCase1----------------" << endl;
3118 EList<Edit> edits;
3119 Coord coord;
3120
3121 if (refstr.length() == 0 ||
3122 readstr.length() == 0) {
3123 return;
3124 }
3125
3126 EList<Edit> ed;
3127
3128 string pad;
3129 makePadString(refstr, readstr, pad, 5);
3130
3131 string ref2 = pad + refstr + pad;
3132 string read2 = pad + readstr + pad;
3133 alignStrings(refstr, readstr, edits, coord);
3134
3135 size_t left = pad.length();
3136 size_t right = left + readstr.length();
3137
3138 edits.reserveExact(ed.size());
3139 for(size_t i = 0; i < ed.size(); i++) {
3140 if(ed[i].pos >= left && ed[i].pos <= right) {
3141 edits.push_back(ed[i]);
3142 edits.back().pos -= left;
3143 }
3144 }
3145
3146
3147 #if 0
3148 RepeatGroup rg;
3149
3150 rg.edits = edits;
3151 rg.coord = coord;
3152 rg.seq = readstr;
3153 rg.base_offset = 0;
3154
3155 string chr_name = "rep";
3156
3157 cerr << "REF : " << refstr << endl;
3158 cerr << "READ: " << readstr << endl;
3159 size_t snpids = 0;
3160 rg.buildSNPs(snpids);
3161 rg.writeSNPs(cerr, chr_name); cerr << endl;
3162 #endif
3163 }
3164
3165
3166 template<typename TStr>
RepeatBuilder(TStr & s,const EList<RefRecord> & szs,const EList<string> & ref_names,bool forward_only,const string & filename)3167 RepeatBuilder<TStr>::RepeatBuilder(TStr& s,
3168 const EList<RefRecord>& szs,
3169 const EList<string>& ref_names,
3170 bool forward_only,
3171 const string& filename) :
3172 s_(s),
3173 coordHelper_(s.length(), forward_only ? s.length() : s.length() / 2, szs, ref_names),
3174 forward_only_(forward_only),
3175 filename_(filename),
3176 forward_length_(forward_only ? s.length() : s.length() / 2)
3177 {
3178 cerr << "RepeatBuilder: " << filename_ << endl;
3179 }
3180
3181 template<typename TStr>
~RepeatBuilder()3182 RepeatBuilder<TStr>::~RepeatBuilder()
3183 {
3184 for(map<size_t, RB_Repeat*>::iterator it = repeat_map_.begin(); it != repeat_map_.end(); it++) {
3185 delete it->second;
3186 }
3187 repeat_map_.clear();
3188 }
3189
3190 template<typename TStr>
readSA(const RepeatParameter & rp,BlockwiseSA<TStr> & sa)3191 void RepeatBuilder<TStr>::readSA(const RepeatParameter& rp,
3192 BlockwiseSA<TStr>& sa)
3193 {
3194 TIndexOffU count = 0;
3195 subSA_.init(s_.length() + 1, rp.min_repeat_len, rp.repeat_count);
3196
3197 while(count < s_.length() + 1) {
3198 TIndexOffU saElt = sa.nextSuffix();
3199 count++;
3200
3201 if(count && (count % 10000000 == 0)) {
3202 cerr << "SA count " << count << endl;
3203 }
3204
3205 if(saElt == s_.length()) {
3206 assert_eq(count, s_.length() + 1);
3207 break;
3208 }
3209
3210 subSA_.push_back(s_, coordHelper_, saElt, count == s_.length());
3211 }
3212
3213 cerr << "subSA size is " << subSA_.size() << endl;
3214 subSA_.dump();
3215 #if 0
3216 for(size_t i = 0; i < subSA_.size(); i++) {
3217 TIndexOffU joinedOff = subSA_[i];
3218 fp << setw(10) << joinedOff << " " << getString(s_, joinedOff, rp.seed_len) << endl;
3219 }
3220 #endif
3221 cerr << "subSA mem Usage: " << subSA_.getMemUsage() << endl << endl;
3222 }
3223
3224 template<typename TStr>
readSA(const RepeatParameter & rp,const BitPackedArray & sa)3225 void RepeatBuilder<TStr>::readSA(const RepeatParameter &rp,
3226 const BitPackedArray &sa)
3227 {
3228 TIndexOffU count = 0;
3229
3230 subSA_.init(s_.length() + 1, rp.min_repeat_len, rp.repeat_count);
3231
3232 for(size_t i = 0; i < sa.size(); i++) {
3233 TIndexOffU saElt = sa[i];
3234 count++;
3235
3236 if(count && (count % 10000000 == 0)) {
3237 cerr << "RB count " << count << endl;
3238 }
3239
3240 if(saElt == s_.length()) {
3241 assert_eq(count, s_.length() + 1);
3242 break;
3243 }
3244
3245 subSA_.push_back(s_, coordHelper_, saElt, count == s_.length());
3246 }
3247
3248 cerr << "subSA size: " << endl;
3249 subSA_.dump();
3250 #if 0
3251 for(size_t i = 0; i < subSA_.size(); i++) {
3252 TIndexOffU joinedOff = subSA_[i];
3253 fp << setw(10) << joinedOff << " " << getString(s_, joinedOff, rp.seed_len) << endl;
3254 }
3255 #endif
3256 cerr << "subSA mem Usage: " << subSA_.getMemUsage() << endl << endl;
3257 }
3258
3259 template<typename TStr>
build(const RepeatParameter & rp)3260 void RepeatBuilder<TStr>::build(const RepeatParameter& rp)
3261 {
3262 string rpt_len_str;
3263 rpt_len_str = to_string(rp.min_repeat_len) + "-" + to_string(rp.max_repeat_len);
3264
3265 string seed_filename = filename_ + ".rep." + rpt_len_str + ".seed";
3266 ofstream fp(seed_filename.c_str());
3267
3268 swaligner_.init_dyn(rp);
3269
3270 RB_RepeatManager* repeat_manager = new RB_RepeatManager;
3271
3272 EList<RB_RepeatBase> repeatBases;
3273 subSA_.buildRepeatBase(s_,
3274 coordHelper_,
3275 rp.max_repeat_len,
3276 repeatBases);
3277
3278 size_t repeat_id = 0;
3279 for(size_t i = 0; i < repeatBases.size(); i++) {
3280 addRepeatGroup(rp,
3281 repeat_id,
3282 *repeat_manager,
3283 repeatBases[i],
3284 fp);
3285 }
3286
3287 {
3288 // Build and test minimizer-based k-mer table
3289 const size_t window = RB_Minimizer<TStr>::default_w;
3290 const size_t k = RB_Minimizer<TStr>::default_k;
3291 RB_KmerTable kmer_table;
3292 EList<string> seqs;
3293 seqs.reserveExact(repeat_map_.size());
3294 for(map<size_t, RB_Repeat*>::const_iterator it = repeat_map_.begin(); it != repeat_map_.end(); it++) {
3295 const RB_Repeat& repeat = *(it->second);
3296 assert(repeat.satisfy(rp));
3297 seqs.expand();
3298 seqs.back() = repeat.consensus();
3299 }
3300 kmer_table.build(seqs,
3301 window,
3302 k);
3303 kmer_table.dump(cerr);
3304 cerr << endl;
3305
3306 string query, rc_query;
3307 EList<pair<uint64_t, size_t> > minimizers;
3308 size_t total = 0, num_repeat = 0, correct = 0, false_positive = 0, false_negative = 0;
3309 for(size_t i = 0; i + rp.min_repeat_len <= forward_length_; i += 1000) {
3310 if(coordHelper_.getEnd(i) != coordHelper_.getEnd(i + rp.min_repeat_len))
3311 continue;
3312 query = getString(s_, i, rp.min_repeat_len);
3313 rc_query = reverseComplement(query);
3314
3315 TIndexOffU idx = subSA_.find_repeat_idx(s_, query);
3316 const EList<TIndexOffU>& test_repeat_index = subSA_.getRepeatIndex();
3317 bool repeat = (idx < test_repeat_index.size());
3318 bool est_repeat = kmer_table.isRepeat(query,
3319 rc_query,
3320 minimizers);
3321
3322 total++;
3323 if(repeat) num_repeat++;
3324 if(repeat == est_repeat) {
3325 correct++;
3326 } else {
3327 if(est_repeat) {
3328 false_positive++;
3329 } else {
3330 false_negative++;
3331 assert(false);
3332 }
3333 }
3334 }
3335
3336 cerr << "total: " << total << endl;
3337 cerr << "repeat: " << num_repeat << endl;
3338 cerr << "correct: " << correct << endl;
3339 cerr << "false positive: " << false_positive << endl;
3340 cerr << "false negative: " << false_negative << endl;
3341 cerr << endl;
3342
3343 ELList<RB_Alignment> position2D; EList<RB_Alignment> alignments;
3344 size_t repeat_total = 0, repeat_aligned = 0;
3345 const EList<TIndexOffU>& test_repeat_index = subSA_.getRepeatIndex();
3346 size_t interval = 1;
3347 if(test_repeat_index.size() >= 1000) {
3348 interval = test_repeat_index.size() / 1000;
3349 }
3350 size_t total_alignments = 0, max_alignments = 0;
3351 string query2, rc_query2;
3352 for(size_t i = 0; i < test_repeat_index.size(); i += interval) {
3353 TIndexOffU saElt_idx = test_repeat_index[i];
3354 TIndexOffU saElt_idx_end = (i + 1 < test_repeat_index.size() ? test_repeat_index[i+1] : subSA_.size());
3355 TIndexOffU saElt_size = saElt_idx_end - saElt_idx;
3356 TIndexOffU saElt = subSA_[saElt_idx];
3357 query = getString(s_, saElt, rp.min_repeat_len);
3358 query2 = query;
3359
3360 // introduce three mismatches into a query when the query is at lest 100-bp
3361 if(rp.min_repeat_len >= 100) {
3362 const size_t mid_pos1 = (size_t)(rp.min_repeat_len * 0.1);
3363 if(query2[mid_pos1] == 'A') {
3364 query2[mid_pos1] = 'C';
3365 } else {
3366 query2[mid_pos1] = 'A';
3367 }
3368 const size_t mid_pos2 = (size_t)(rp.min_repeat_len * 0.5);
3369 if(query2[mid_pos2] == 'C') {
3370 query2[mid_pos2] = 'G';
3371 } else {
3372 query2[mid_pos2] = 'C';
3373 }
3374 const size_t mid_pos3 = (size_t)(rp.min_repeat_len * 0.9);
3375 if(query2[mid_pos3] == 'G') {
3376 query2[mid_pos3] = 'T';
3377 } else {
3378 query2[mid_pos3] = 'G';
3379 }
3380 }
3381
3382 repeat_total += saElt_size;
3383
3384 size_t found = 0;
3385 size_t cur_alignments = 0;
3386 if(kmer_table.isRepeat(query2, minimizers)) {
3387 kmer_table.findAlignments(query2,
3388 minimizers,
3389 position2D,
3390 alignments);
3391 total_alignments += (alignments.size() * saElt_size);
3392 cur_alignments = alignments.size();
3393 TIndexOffU baseoff = 0;
3394 for(size_t s = 0; s < seqs.size(); s++) {
3395 int spos = seqs[s].find(query);
3396 if(spos != string::npos) {
3397 for(size_t a = 0; a < alignments.size(); a++) {
3398 if(alignments[a].pos == baseoff + spos) {
3399 found++;
3400 }
3401 }
3402 }
3403 baseoff += seqs[s].length();
3404 }
3405
3406 assert_leq(found, 1);
3407 }
3408
3409 rc_query = reverseComplement(query);
3410 rc_query2 = reverseComplement(query2);
3411 size_t rc_found = 0;
3412 if(kmer_table.isRepeat(rc_query2, minimizers)) {
3413 kmer_table.findAlignments(rc_query2,
3414 minimizers,
3415 position2D,
3416 alignments);
3417 total_alignments += (alignments.size() * saElt_size);
3418 cur_alignments += alignments.size();
3419 if(cur_alignments > max_alignments) {
3420 max_alignments = cur_alignments;
3421 }
3422
3423 TIndexOffU baseoff = 0;
3424 for(size_t s = 0; s < seqs.size(); s++) {
3425 int spos = seqs[s].find(rc_query);
3426 if(spos != string::npos) {
3427 for(size_t a = 0; a < alignments.size(); a++) {
3428 if(alignments[a].pos == baseoff + spos) {
3429 rc_found++;
3430 }
3431 }
3432 }
3433 baseoff += seqs[s].length();
3434 }
3435 assert_leq(rc_found, 1);
3436 }
3437
3438 if(found + rc_found == 1) {
3439 repeat_aligned += saElt_size;
3440 }
3441 }
3442
3443 cerr << "num repeats: " << repeat_total << endl;
3444 cerr << "repeat aligned using minimizers: " << repeat_aligned << endl;
3445 cerr << "average number of alignments: " << (float)total_alignments / repeat_total << endl;
3446 cerr << "max alignment: " << max_alignments << endl;
3447 cerr << endl;
3448 }
3449
3450 cerr << "number of repeats is " << repeat_map_.size() << endl;
3451
3452 size_t total_rep_seq_len = 0, total_allele_seq_len = 0;
3453 size_t i = 0;
3454 for(map<size_t, RB_Repeat*>::iterator it = repeat_map_.begin(); it != repeat_map_.end(); it++, i++) {
3455 RB_Repeat& repeat = *(it->second);
3456 if(!repeat.satisfy(rp))
3457 continue;
3458
3459 repeat.saveSeedExtension(rp,
3460 s_,
3461 coordHelper_,
3462 i,
3463 fp,
3464 total_rep_seq_len,
3465 total_allele_seq_len);
3466 }
3467
3468 size_t total_qual_seeds = 0;
3469 for(map<size_t, RB_Repeat*>::iterator it = repeat_map_.begin(); it != repeat_map_.end(); it++, i++) {
3470 RB_Repeat& repeat = *(it->second);
3471 EList<SeedExt>& seeds = repeat.seeds();
3472 for(size_t i = 0; i < seeds.size(); i++) {
3473 if(seeds[i].getLength() < rp.min_repeat_len)
3474 continue;
3475 total_qual_seeds++;
3476 }
3477 }
3478
3479 cerr << "total repeat sequence length: " << total_rep_seq_len << endl;
3480 cerr << "total allele sequence length: " << total_allele_seq_len << endl;
3481 cerr << "total number of seeds including those that are position-wise different, but sequence-wise identical: " << total_qual_seeds << endl;
3482 cerr << endl;
3483 fp << "total repeat sequence length: " << total_rep_seq_len << endl;
3484 fp << "total allele sequence length: " << total_allele_seq_len << endl;
3485 fp << "total number of seeds including those that are position-wise different, but sequence-wise identical: " << total_qual_seeds << endl;
3486 fp.close();
3487
3488 delete repeat_manager;
3489 repeat_manager = NULL;
3490
3491 // remove non-qualifying repeats
3492 // and update repeat IDs for those remaining
3493 {
3494 map<size_t, RB_Repeat*> temp_repeat_map;
3495 size_t i = 0;
3496 for(map<size_t, RB_Repeat*>::iterator it = repeat_map_.begin(); it != repeat_map_.end(); it++) {
3497 RB_Repeat* repeat = it->second;
3498 if(!repeat->satisfy(rp)) {
3499 delete repeat;
3500 continue;
3501 }
3502 repeat->repeat_id(i);
3503 temp_repeat_map[repeat->repeat_id()] = repeat;
3504 i++;
3505 }
3506 repeat_map_ = temp_repeat_map;
3507 }
3508
3509 const bool sanity_check = true;
3510 if(sanity_check) {
3511 EList<pair<size_t, size_t> > kmer_table;
3512 string query;
3513 size_t total = 0, match = 0;
3514 EList<TIndexOffU> positions;
3515 const EList<TIndexOffU>& test_repeat_index = subSA_.getRepeatIndex();
3516 size_t interval = 1;
3517 if(test_repeat_index.size() >= 10000) {
3518 interval = test_repeat_index.size() / 10000;
3519 }
3520 for(size_t i = 0; i < test_repeat_index.size(); i += interval) {
3521 TIndexOffU saElt_idx = test_repeat_index[i];
3522 TIndexOffU saElt_idx_end = (i + 1 < test_repeat_index.size() ? test_repeat_index[i+1] : subSA_.size());
3523 positions.clear();
3524 for(size_t j = saElt_idx; j < saElt_idx_end; j++) {
3525 positions.push_back(subSA_[j]);
3526 #ifndef NDEBUG
3527 if(j > saElt_idx) {
3528 TIndexOffU lcp_len = getLCP(s_,
3529 coordHelper_,
3530 positions[0],
3531 positions.back(),
3532 rp.min_repeat_len);
3533 assert_eq(lcp_len, rp.min_repeat_len);
3534 }
3535
3536 TIndexOffU saElt = subSA_[j];
3537 TIndexOffU start = coordHelper_.getStart(saElt);
3538 TIndexOffU start2 = coordHelper_.getStart(saElt + rp.min_repeat_len - 1);
3539 assert_eq(start, start2);
3540 #endif
3541 }
3542
3543 TIndexOffU saElt = subSA_[saElt_idx];
3544 size_t true_count = saElt_idx_end - saElt_idx;
3545 getString(s_, saElt, rp.min_repeat_len, query);
3546 total++;
3547
3548 size_t count = 0, rc_count = 0;
3549 string rc_query = reverse_complement(query);
3550 for(map<size_t, RB_Repeat*>::iterator it = repeat_map_.begin(); it != repeat_map_.end(); it++) {
3551 RB_Repeat& repeat = *(it->second);
3552 int pos = repeat.consensus().find(query);
3553 if(pos != string::npos) {
3554 for(size_t s = 0; s < repeat.seeds().size(); s++) {
3555 SeedExt& seed = repeat.seeds()[s];
3556 string seq;
3557 seed.getExtendedSeedSequence(s_, seq);
3558 if(seq.find(query) != string::npos)
3559 count++;
3560 }
3561 }
3562 pos = repeat.consensus().find(rc_query);
3563 if(pos != string::npos) {
3564 for(size_t s = 0; s < repeat.seeds().size(); s++) {
3565 SeedExt& seed = repeat.seeds()[s];
3566 string seq;
3567 seed.getExtendedSeedSequence(s_, seq);
3568 if(seq.find(rc_query) != string::npos)
3569 rc_count++;
3570 }
3571 }
3572 }
3573
3574 if(count == true_count || rc_count == true_count) {
3575 match++;
3576 } else if(total - match <= 10) {
3577 cerr << " query: " << query << endl;
3578 cerr << "rc_query: " << rc_query << endl;
3579 cerr << "true count: " << true_count << endl;
3580 cerr << "found count: " << count << endl;
3581 cerr << "rc found count: " << rc_count << endl;
3582 cerr << endl;
3583 }
3584 }
3585
3586 cerr << "RepeatBuilder: sanity check: " << match << " passed (out of " << total << ")" << endl << endl;
3587 }
3588 }
3589
3590 template<typename TStr>
writeHaploType(const EList<SeedHP> & haplo_lists,const EList<SeedExt> & seeds,TIndexOffU & hapl_id_base,const string & seq_name,ostream & fp)3591 void RepeatBuilder<TStr>::writeHaploType(const EList<SeedHP>& haplo_lists,
3592 const EList<SeedExt>& seeds,
3593 TIndexOffU& hapl_id_base,
3594 const string& seq_name,
3595 ostream &fp)
3596 {
3597 if(haplo_lists.size() == 0) {
3598 return;
3599 }
3600
3601 for(size_t i = 0; i < haplo_lists.size(); i++) {
3602 const SeedHP &haploType = haplo_lists[i];
3603
3604 fp << "rpht" << hapl_id_base++;
3605 fp << "\t" << seq_name;
3606 fp << "\t" << haploType.range.first;
3607 fp << "\t" << haploType.range.second;
3608 fp << "\t";
3609
3610 assert_gt(haploType.snpIDs.size(), 0);
3611
3612 for (size_t j = 0; j < haploType.snpIDs.size(); j++) {
3613 if(j) {
3614 fp << ",";
3615 }
3616 fp << haploType.snpIDs[j];
3617 }
3618 fp << endl;
3619 }
3620 }
3621
3622 template<typename TStr>
writeAllele(TIndexOffU grp_id,TIndexOffU allele_id,Range range,const string & seq_name,TIndexOffU baseoff,const EList<SeedExt> & seeds,ostream & fp)3623 void RepeatBuilder<TStr>::writeAllele(TIndexOffU grp_id,
3624 TIndexOffU allele_id,
3625 Range range,
3626 const string& seq_name,
3627 TIndexOffU baseoff,
3628 const EList<SeedExt>& seeds,
3629 ostream &fp)
3630 {
3631 // >rpt_name*0\trep\trep_pos\trep_len\tpos_count\t0
3632 // chr_name:pos:direction chr_name:pos:direction
3633 //
3634 // >rep1*0 rep 0 100 470 0
3635 // 22:112123123:+ 22:1232131113:+
3636 //
3637 size_t snp_size = seeds[range.first].snps.size();
3638 size_t pos_size = range.second - range.first;
3639 fp << ">";
3640 fp << "rpt_" << grp_id << "*" << allele_id;
3641 fp << "\t" << seq_name;
3642 fp << "\t" << baseoff + seeds[range.first].consensus_pos.first;
3643 fp << "\t" << seeds[range.first].consensus_pos.second - seeds[range.first].consensus_pos.first;
3644 fp << "\t" << pos_size;
3645 fp << "\t" << snp_size;
3646
3647 fp << "\t";
3648 for(size_t i = 0; i < snp_size; i++) {
3649 if(i > 0) {fp << ",";}
3650 fp << "rps" << seeds[range.first].snps[i]->id;
3651 }
3652 fp << endl;
3653
3654 // print positions
3655 for(size_t i = 0; i < pos_size; i++) {
3656 if(i > 0 && (i % 10 == 0)) {
3657 fp << endl;
3658 }
3659
3660 if(i % 10 != 0) {
3661 fp << " ";
3662 }
3663 string chr_name;
3664 TIndexOffU pos_in_chr;
3665 TIndexOffU joinedOff = seeds[range.first + i].pos.first;
3666
3667 bool fw = true;
3668 if(joinedOff < forward_length_) {
3669 fw = true;
3670 } else {
3671 fw = false;
3672 joinedOff = s_.length() - joinedOff - seeds[range.first].getLength();
3673 }
3674
3675 coordHelper_.getGenomeCoord(joinedOff, chr_name, pos_in_chr);
3676
3677 char direction = fw ? '+' : '-';
3678 fp << chr_name << ":" << pos_in_chr << ":" << direction;
3679 }
3680
3681 fp << endl;
3682 }
3683
3684 template<typename TStr>
writeSNPs(ostream & fp,const string & rep_chr_name,const ESet<Edit> & editset)3685 void RepeatBuilder<TStr>::writeSNPs(ostream& fp,
3686 const string& rep_chr_name,
3687 const ESet<Edit>& editset)
3688 {
3689 for(size_t i = 0; i < editset.size(); i++) {
3690 const Edit& ed = editset[i];
3691
3692 if(ed.isMismatch()) {
3693 fp << "rps" << ed.snpID;
3694 fp << "\t" << "single";
3695 fp << "\t" << rep_chr_name;
3696 fp << "\t" << ed.pos;
3697 fp << "\t" << ed.qchr;
3698 fp << endl;
3699 } else {
3700 assert(false);
3701 }
3702 }
3703 }
3704
3705
3706 template<typename TStr>
saveFile(const RepeatParameter & rp)3707 void RepeatBuilder<TStr>::saveFile(const RepeatParameter& rp)
3708 {
3709 saveRepeats(rp);
3710 }
3711
checkRedundant(const RepeatParameter & rp,const map<size_t,RB_Repeat * > & repeat_map,const EList<TIndexOffU> & positions,EList<size_t> & to_remove) const3712 bool RB_RepeatManager::checkRedundant(const RepeatParameter& rp,
3713 const map<size_t, RB_Repeat*>& repeat_map,
3714 const EList<TIndexOffU>& positions,
3715 EList<size_t>& to_remove) const
3716 {
3717 to_remove.clear();
3718 bool replace = false;
3719 for(size_t i = 0; i < positions.size(); i++) {
3720 TIndexOffU seed_pos = positions[i];
3721 Range seed_range(seed_pos + rp.seed_len, 0);
3722 map<Range, EList<size_t> >::const_iterator it = range_to_repeats_.upper_bound(seed_range);
3723 if(it == range_to_repeats_.end())
3724 continue;
3725
3726 for(map<Range, EList<size_t> >::const_reverse_iterator rit(it); rit != range_to_repeats_.rend(); rit++) {
3727 Range repeat_range = rit->first;
3728 if(repeat_range.first + rp.max_repeat_len <= seed_pos)
3729 break;
3730
3731 const EList<size_t>& repeat_ids = rit->second;
3732 assert_gt(repeat_ids.size(), 0);
3733 for(size_t r = 0; r < repeat_ids.size(); r++) {
3734 size_t repeat_id = repeat_ids[r];
3735 size_t idx = to_remove.bsearchLoBound(repeat_id);
3736 if(idx < to_remove.size() && to_remove[idx] == repeat_id)
3737 continue;
3738
3739 bool overlap = seed_pos < repeat_range.second && seed_pos + rp.seed_len > repeat_range.first;
3740 if(!overlap)
3741 continue;
3742
3743 map<size_t, RB_Repeat*>::const_iterator it2 = repeat_map.find(repeat_id);
3744 assert(it2 != repeat_map.end());
3745 const RB_Repeat& repeat = *(it2->second);
3746 const EList<RB_AlleleCoord>& allele_ranges = repeat.seed_ranges();
3747 size_t num_contain = 0, num_overlap = 0, num_close = 0, num_overlap_bp = 0;
3748 size_t p = 0, p2 = 0;
3749 while(p < positions.size() && p2 < allele_ranges.size()) {
3750 RB_AlleleCoord range;
3751 range.left = positions[p];
3752 range.right = positions[p] + rp.seed_len;
3753 RB_AlleleCoord range2 = allele_ranges[p2];
3754 if(range2.contain(range)) {
3755 num_contain++;
3756 num_overlap_bp += rp.seed_len;
3757 } else {
3758 TIndexOffU overlap = range2.overlap_len(range);
3759 if(overlap > 0) {
3760 num_overlap++;
3761 num_overlap_bp += (range2.right - range.left);
3762 } else if(range.right + 10 > range2.left && range2.right + 10 > range.left) {
3763 num_close++;
3764 }
3765 }
3766 if(range.right <= range2.right) p++;
3767 else p2++;
3768 }
3769
3770 // if the number of matches is >= 90% of positions in the smaller group
3771 if((num_contain + num_overlap) * 10 + num_close * 8 >= min(positions.size(), allele_ranges.size()) * 9) {
3772 if(positions.size() <= allele_ranges.size()) {
3773 return true;
3774 } else {
3775 replace = true;
3776 to_remove.push_back(repeat_id);
3777 to_remove.sort();
3778 }
3779 }
3780 }
3781 }
3782
3783 // DK - check this out
3784 if(replace)
3785 break;
3786 }
3787 return false;
3788 }
3789
addRepeat(const RB_Repeat * repeat)3790 void RB_RepeatManager::addRepeat(const RB_Repeat* repeat)
3791 {
3792 const EList<RB_AlleleCoord>& allele_ranges = repeat->seed_ranges();
3793 for(size_t i = 0; i < allele_ranges.size(); i++) {
3794 Range allele_range(allele_ranges[i].left, allele_ranges[i].right);
3795 addRepeat(allele_range, repeat->repeat_id());
3796 }
3797 }
3798
addRepeat(Range range,size_t repeat_id)3799 void RB_RepeatManager::addRepeat(Range range, size_t repeat_id)
3800 {
3801 if(range_to_repeats_.find(range) == range_to_repeats_.end()) {
3802 range_to_repeats_[range] = EList<size_t>();
3803 }
3804 EList<size_t>& repeat_ids = range_to_repeats_[range];
3805 size_t idx = repeat_ids.bsearchLoBound(repeat_id);
3806 if(idx < repeat_ids.size() && repeat_ids[idx] == repeat_id)
3807 return;
3808 repeat_ids.push_back(repeat_id);
3809 repeat_ids.sort();
3810 }
3811
removeRepeat(const RB_Repeat * repeat)3812 void RB_RepeatManager::removeRepeat(const RB_Repeat* repeat)
3813 {
3814 const EList<RB_AlleleCoord>& allele_ranges = repeat->seed_ranges();
3815 for(size_t p = 0; p < allele_ranges.size(); p++) {
3816 Range range(allele_ranges[p].left, allele_ranges[p].right);
3817 removeRepeat(range, repeat->repeat_id());
3818 }
3819 }
3820
removeRepeat(Range range,size_t repeat_id)3821 void RB_RepeatManager::removeRepeat(Range range, size_t repeat_id)
3822 {
3823 EList<size_t>& repeat_ids = range_to_repeats_[range];
3824 TIndexOffU idx = repeat_ids.bsearchLoBound(repeat_id);
3825 if(idx < repeat_ids.size() && repeat_id == repeat_ids[idx]) {
3826 repeat_ids.erase(idx);
3827 if(repeat_ids.empty())
3828 range_to_repeats_.erase(range);
3829 }
3830 }
3831
showInfo(const RepeatParameter & rp,CoordHelper & coordHelper,const map<size_t,RB_Repeat * > & repeat_map,size_t num_case) const3832 void RB_RepeatManager::showInfo(const RepeatParameter& rp,
3833 CoordHelper& coordHelper,
3834 const map<size_t, RB_Repeat*>& repeat_map,
3835 size_t num_case) const
3836 {
3837 size_t count = 0;
3838 for(map<Range, EList<size_t> >::const_iterator it = range_to_repeats_.begin();
3839 it != range_to_repeats_.end(); it++) {
3840 const Range& range = it->first;
3841 if(range.second - range.first < rp.min_repeat_len) continue;
3842 map<Range, EList<size_t> >::const_iterator jt = it; jt++;
3843 for(; jt != range_to_repeats_.end(); jt++) {
3844 const Range& range2 = jt->first;
3845 if(range2.second - range2.first < rp.min_repeat_len) continue;
3846 if(range.second <= range2.first)
3847 break;
3848 if(count < num_case) {
3849 cerr << "range (" << range.first << ", " << range.second << ") vs. range2 (";
3850 cerr << range2.first << ", " << range2.second << ")" << endl;
3851 for(size_t i = 0; i < it->second.size(); i++) {
3852 cerr << "\t1 " << it->second[i] << endl;
3853 const RB_Repeat* repeat = repeat_map.find(it->second[i])->second;
3854 repeat->showInfo(rp, coordHelper);
3855 }
3856 for(size_t i = 0; i < jt->second.size(); i++) {
3857 cerr << "\t2 " << jt->second[i] << endl;
3858 const RB_Repeat* repeat = repeat_map.find(jt->second[i])->second;
3859 repeat->showInfo(rp, coordHelper);
3860 }
3861 cerr << endl << endl;
3862 }
3863 count++;
3864 }
3865 }
3866 cerr << "ShowInfo - count: " << count << endl;
3867 }
3868
3869 template<typename TStr>
reassignSeeds(const RepeatParameter & rp,size_t repeat_bid,size_t repeat_eid,EList<SeedExt> & seeds)3870 void RepeatBuilder<TStr>::reassignSeeds(const RepeatParameter& rp,
3871 size_t repeat_bid, // repeat begin id
3872 size_t repeat_eid, // repeat end id
3873 EList<SeedExt>& seeds)
3874 {
3875 assert_lt(repeat_bid, repeat_eid);
3876 EList<bool> updated;
3877 updated.resizeExact(repeat_eid - repeat_bid);
3878 updated.fillZero();
3879
3880 string seq;
3881 for(size_t s = 0; s < seeds.size(); s++) {
3882 SeedExt& seed = seeds[s];
3883 size_t max_repeat_id = repeat_bid;
3884 size_t max_ext_len = 0;
3885 for(size_t i = repeat_bid; i < repeat_eid; i++) {
3886 map<size_t, RB_Repeat*>::iterator it = repeat_map_.find(i);
3887 assert(it != repeat_map_.end());
3888 const RB_Repeat& repeat = *(it->second);
3889 const string& consensus = repeat.consensus();
3890 const size_t ext_len = (consensus.length() - rp.seed_len) / 2;
3891
3892 if(seed.bound.first + ext_len > seed.pos.first)
3893 continue;
3894 if(seed.pos.second + ext_len > seed.bound.second)
3895 continue;
3896
3897 getString(s_, seed.pos.first - ext_len, rp.seed_len + ext_len * 2, seq);
3898 assert_eq(consensus.length(), seq.length());
3899 size_t tmp_ext_len = 0;
3900 for(size_t j = 0; j < ext_len; j++, tmp_ext_len++) {
3901 if(seq[ext_len - j - 1] != consensus[ext_len - j - 1] ||
3902 seq[ext_len + rp.seed_len + j] != consensus[ext_len + rp.seed_len + j]) {
3903 break;
3904 }
3905 }
3906
3907 if(tmp_ext_len > max_ext_len) {
3908 max_repeat_id = i;
3909 max_ext_len = tmp_ext_len;
3910 }
3911 }
3912 if(rp.seed_len + max_ext_len * 2 >= rp.min_repeat_len) {
3913 seed.pos.first -= max_ext_len;
3914 seed.pos.second += max_ext_len;
3915 map<size_t, RB_Repeat*>::iterator it = repeat_map_.find(max_repeat_id);
3916 assert(it != repeat_map_.end());
3917 RB_Repeat& repeat = *(it->second);
3918 const string& consensus = repeat.consensus();
3919 const size_t ext_len = (consensus.length() - rp.seed_len) / 2;
3920 assert_leq(max_ext_len, ext_len);
3921 seed.consensus_pos.first = ext_len - max_ext_len;
3922 seed.consensus_pos.second = consensus.length() - (ext_len - max_ext_len);
3923 assert_leq(seed.consensus_pos.second - seed.consensus_pos.first, consensus.length());
3924 repeat.addSeed(seed);
3925 updated[max_repeat_id - repeat_bid] = true;
3926 }
3927 }
3928
3929 for(size_t i = repeat_bid; i < repeat_eid; i++) {
3930 if(!updated[i - repeat_bid])
3931 continue;
3932 map<size_t, RB_Repeat*>::iterator it = repeat_map_.find(i);
3933 assert(it != repeat_map_.end());
3934 RB_Repeat& repeat = *(it->second);
3935 repeat.update();
3936 }
3937 }
3938
3939 /**
3940 * TODO
3941 * @brief
3942 *
3943 * @param rpt_seq
3944 * @param rpt_range
3945 */
3946 template<typename TStr>
addRepeatGroup(const RepeatParameter & rp,size_t & repeat_id,RB_RepeatManager & repeat_manager,const RB_RepeatBase & repeatBase,ostream & fp)3947 void RepeatBuilder<TStr>::addRepeatGroup(const RepeatParameter& rp,
3948 size_t& repeat_id,
3949 RB_RepeatManager& repeat_manager,
3950 const RB_RepeatBase& repeatBase,
3951 ostream& fp)
3952 {
3953 RB_Repeat* repeat = new RB_Repeat;
3954 repeat->repeat_id(repeat_id);
3955 repeat->init(rp,
3956 s_,
3957 coordHelper_,
3958 subSA_,
3959 repeatBase);
3960
3961 assert(repeat_map_.find(repeat->repeat_id()) == repeat_map_.end());
3962 repeat_map_[repeat->repeat_id()] = repeat;
3963 repeat_id++;
3964 }
3965
3966 template<typename TStr>
checkSequenceMergeable(const string & ref,const string & read,EList<Edit> & edits,Coord & coord,TIndexOffU rpt_len,TIndexOffU max_edit)3967 bool RepeatBuilder<TStr>::checkSequenceMergeable(const string& ref,
3968 const string& read,
3969 EList<Edit>& edits,
3970 Coord& coord,
3971 TIndexOffU rpt_len,
3972 TIndexOffU max_edit)
3973 {
3974 size_t max_matchlen = 0;
3975 EList<Edit> ed;
3976
3977 string pad;
3978 swaligner_.makePadString(ref, read, pad, 5);
3979
3980 string ref2 = pad + ref;
3981 string read2 = pad + read;
3982
3983 swaligner_.alignStrings(ref2, read2, ed, coord);
3984
3985 // match should start from pad string
3986 if(coord.off() != 0) {
3987 return false;
3988 }
3989
3990 // no edits on pad string
3991 if(ed.size() > 0 && ed[0].pos < pad.length()) {
3992 return false;
3993 }
3994
3995 size_t left = pad.length();
3996 size_t right = left + read.length();
3997
3998 edits.clear();
3999 edits.reserveExact(ed.size());
4000 for(size_t i = 0; i < ed.size(); i++) {
4001 if(ed[i].pos >= left && ed[i].pos <= right) {
4002 edits.push_back(ed[i]);
4003 edits.back().pos -= left;
4004 }
4005 }
4006
4007 max_matchlen = getMaxMatchLen(edits, read.length());
4008
4009 #ifdef DEBUGLOG
4010 {
4011 cerr << "After pad removed" << endl;
4012 BTDnaString btread;
4013 btread.install(read.c_str(), true);
4014 Edit::print(cerr, edits); cerr << endl;
4015 Edit::printQAlign(cerr, btread, edits);
4016 }
4017 #endif
4018
4019 return (max_matchlen >= rpt_len);
4020 }
4021
4022
4023 template<typename TStr>
saveRepeats(const RepeatParameter & rp)4024 void RepeatBuilder<TStr>::saveRepeats(const RepeatParameter &rp)
4025 {
4026 ios_base::openmode mode = ios_base::out;
4027 if(rp.append_result) {
4028 mode |= ios_base::app;
4029 } else {
4030 mode |= ios_base::trunc;
4031 }
4032 // Generate SNPs
4033 size_t i = 0;
4034 for(map<size_t, RB_Repeat*>::iterator it = repeat_map_.begin(); it != repeat_map_.end(); it++, i++) {
4035 RB_Repeat& repeat = *(it->second);
4036 if(!repeat.satisfy(rp))
4037 continue;
4038
4039 // for each repeats
4040 repeat.generateSNPs(rp, s_, i);
4041 }
4042
4043 // save snp, consensus sequenuce, info
4044 string snp_fname = filename_ + ".rep.snp";
4045 string info_fname = filename_ + ".rep.info";
4046 string hapl_fname = filename_ + ".rep.haplotype";
4047
4048 ofstream snp_fp(snp_fname.c_str(), mode);
4049 ofstream info_fp(info_fname.c_str(), mode);
4050 ofstream hapl_fp(hapl_fname.c_str(), mode);
4051
4052 const string repName = "rep" + to_string(rp.min_repeat_len) + "-" + to_string(rp.max_repeat_len);
4053
4054 i = 0;
4055 TIndexOffU consensus_baseoff = 0;
4056 TIndexOffU snp_id_base = 0;
4057 TIndexOffU hapl_id_base = 0;
4058 for(map<size_t, RB_Repeat*>::iterator it = repeat_map_.begin(); it != repeat_map_.end(); it++, i++) {
4059 RB_Repeat& repeat = *(it->second);
4060 if(!repeat.satisfy(rp))
4061 continue;
4062
4063 // for each repeats
4064 repeat.saveSNPs(snp_fp, i, snp_id_base);
4065 saveAlleles(rp,
4066 repName,
4067 repeat,
4068 info_fp,
4069 hapl_fp,
4070 i,
4071 hapl_id_base,
4072 consensus_baseoff);
4073
4074 consensus_baseoff += repeat.consensus().length();
4075 }
4076
4077 snp_fp.close();
4078 info_fp.close();
4079 hapl_fp.close();
4080
4081 // save all consensus sequence
4082 saveConsensus(rp, repName);
4083 }
4084
4085 template<typename TStr>
saveConsensus(const RepeatParameter & rp,const string & repName)4086 void RepeatBuilder<TStr>::saveConsensus(const RepeatParameter &rp,
4087 const string& repName) {
4088 ios_base::openmode mode = ios_base::out;
4089 if(rp.append_result) {
4090 mode |= ios_base::app;
4091 } else {
4092 mode |= ios_base::trunc;
4093 }
4094
4095 string fa_fname = filename_ + ".rep.fa";
4096 ofstream fa_fp(fa_fname.c_str(), mode);
4097
4098 fa_fp << ">" << repName << endl;
4099
4100 size_t oskip = 0;
4101 for(map<size_t, RB_Repeat*>::iterator it = repeat_map_.begin(); it != repeat_map_.end(); it++) {
4102 RB_Repeat& repeat = *(it->second);
4103 if(!repeat.satisfy(rp))
4104 continue;
4105
4106 // for each repeats
4107 const string& constr = repeat.consensus();
4108 size_t si = 0;
4109 size_t constr_len = constr.length();
4110 while(si < constr_len) {
4111 size_t out_len = std::min((size_t)(output_width - oskip), (size_t)(constr_len - si));
4112 fa_fp << constr.substr(si, out_len);
4113
4114 if((oskip + out_len) == output_width) {
4115 fa_fp << endl;
4116 oskip = 0;
4117 } else {
4118 // last line
4119 oskip = oskip + out_len;
4120 }
4121
4122 si += out_len;
4123 }
4124 }
4125 if(oskip) {
4126 fa_fp << endl;
4127 }
4128
4129 fa_fp.close();
4130 }
4131
4132 template<typename TStr>
saveAlleles(const RepeatParameter & rp,const string & repName,RB_Repeat & repeat,ofstream & fp,ofstream & hapl_fp,TIndexOffU grp_id,TIndexOffU & hapl_id_base,TIndexOffU baseoff)4133 void RepeatBuilder<TStr>::saveAlleles(
4134 const RepeatParameter& rp,
4135 const string& repName,
4136 RB_Repeat& repeat,
4137 ofstream& fp,
4138 ofstream& hapl_fp,
4139 TIndexOffU grp_id,
4140 TIndexOffU& hapl_id_base,
4141 TIndexOffU baseoff)
4142 {
4143 const EList<SeedExt>& seeds = repeat.seeds();
4144 EList<SeedHP> haplo_lists;
4145 Range range(0, seeds.size());
4146
4147 int allele_id = 0;
4148
4149 for(size_t sb = range.first; sb < range.second;) {
4150 size_t se = sb + 1;
4151 for(; se < range.second; se++) {
4152 if(!(SeedExt::isSameConsensus(seeds[sb], seeds[se])
4153 && SeedExt::isSameSNPs(seeds[sb], seeds[se])
4154 && seeds[sb].aligned == seeds[se].aligned)) {
4155 break;
4156 }
4157 }
4158
4159 if(!seeds[sb].aligned) {
4160 sb = se;
4161 continue;
4162 }
4163
4164 if(seeds[sb].getLength() < rp.min_repeat_len) {
4165 sb = se;
4166 continue;
4167 }
4168
4169 // [sb, se) are same alleles
4170 writeAllele(grp_id, allele_id, Range(sb, se),
4171 repName, baseoff, seeds, fp);
4172 generateHaploType(Range(sb, se), seeds, haplo_lists);
4173
4174 allele_id++;
4175 sb = se;
4176 }
4177
4178 // sort HaploType List by pos
4179 haplo_lists.sort();
4180 writeHaploType(haplo_lists, seeds, hapl_id_base, repName, hapl_fp);
4181 }
4182
4183 template<typename TStr>
generateHaploType(Range range,const EList<SeedExt> & seeds,EList<SeedHP> & haplo_list)4184 void RepeatBuilder<TStr>::generateHaploType(Range range, const EList<SeedExt> &seeds, EList<SeedHP> &haplo_list)
4185 {
4186 const EList<SeedSNP *>& snps = seeds[range.first].snps;
4187 if(snps.size() == 0)
4188 return;
4189
4190 // right-most position
4191 TIndexOffU max_right_pos = seeds[range.first].consensus_pos.second - 1;
4192
4193 #ifndef NDEBUG
4194 for(size_t i = 0; i < snps.size(); i++) {
4195 assert_leq(snps[i]->pos, max_right_pos);
4196 }
4197 #endif
4198
4199 // create haplotypes of at least 16 bp long to prevent combinations of SNPs
4200 // break a list of SNPs into several haplotypes if a SNP is far from the next SNP in the list
4201 const TIndexOffU min_ht_len = 16;
4202 size_t eb = 0, ee = 1;
4203 while(ee < snps.size() + 1) {
4204 if(ee == snps.size() ||
4205 snps[eb]->pos + (min_ht_len << 1) < snps[ee]->pos) {
4206 TIndexOffU left_pos = snps[eb]->pos;
4207 TIndexOffU right_pos = snps[ee-1]->pos;
4208
4209 if(snps[ee-1]->type == EDIT_TYPE_READ_GAP) {
4210 right_pos += snps[ee-1]->len;
4211 }
4212 if(left_pos + min_ht_len - 1 > right_pos) {
4213 right_pos = left_pos + min_ht_len - 1;
4214 }
4215 right_pos = min<TIndexOffU>(max_right_pos, right_pos);
4216 assert_leq(left_pos, right_pos);
4217
4218 SeedHP seedHP;
4219
4220 seedHP.range.first = left_pos;
4221 seedHP.range.second = right_pos;
4222 for(size_t i = eb; i < ee; i++) {
4223 string snp_ids = "rps" + to_string(snps[i]->id);
4224 seedHP.snpIDs.push_back(snp_ids);
4225 }
4226
4227 // Add to haplo_list
4228 bool found = false;
4229 for(size_t i = 0; i < haplo_list.size(); i++) {
4230 if(haplo_list[i] == seedHP) {
4231 found = true;
4232 break;
4233 }
4234 }
4235 if(!found) {
4236 // Add
4237 haplo_list.push_back(seedHP);
4238 }
4239
4240 eb = ee;
4241 }
4242 ee++;
4243 }
4244 }
4245
4246
init(TIndexOffU sa_size,TIndexOffU seed_len,TIndexOffU seed_count)4247 void RB_SubSA::init(TIndexOffU sa_size,
4248 TIndexOffU seed_len,
4249 TIndexOffU seed_count)
4250 {
4251 assert_gt(sa_size, 1);
4252 sa_size_ = sa_size;
4253 assert_gt(seed_len, 0);
4254 seed_len_ = seed_len;
4255 assert_gt(seed_count, 1);
4256 seed_count_ = seed_count;
4257 temp_suffixes_.clear();
4258
4259 repeat_list_.clear();
4260 repeat_index_.clear();
4261 }
4262
4263 template<typename TStr>
push_back(const TStr & s,CoordHelper & coordHelper,TIndexOffU saElt,bool lastInput)4264 void RB_SubSA::push_back(const TStr& s,
4265 CoordHelper& coordHelper,
4266 TIndexOffU saElt,
4267 bool lastInput)
4268 {
4269 if(saElt + seed_len() <= coordHelper.getEnd(saElt)) {
4270 if(seed_count_ == 1) {
4271 repeat_list_.push_back(saElt);
4272 } else {
4273 assert_gt(seed_count_, 1);
4274
4275 if(temp_suffixes_.empty()) {
4276 temp_suffixes_.push_back(saElt);
4277 return;
4278 }
4279
4280 TIndexOffU prev_saElt = temp_suffixes_.back();
4281
4282 // calculate common prefix length between two text.
4283 // text1 is started from prev_saElt and text2 is started from saElt
4284 bool same = isSameSequenceUpto(s,
4285 coordHelper,
4286 prev_saElt,
4287 saElt,
4288 seed_len_);
4289
4290 if(same) {
4291 temp_suffixes_.push_back(saElt);
4292 }
4293
4294 if(!same || lastInput) {
4295 if(temp_suffixes_.size() >= seed_count_) {
4296 repeat_index_.push_back(repeat_list_.size());
4297 temp_suffixes_.sort();
4298 for(size_t pi = 0; pi < temp_suffixes_.size(); pi++) {
4299 repeat_list_.push_back(temp_suffixes_[pi]);
4300 }
4301 }
4302 temp_suffixes_.clear();
4303 if(!lastInput) {
4304 temp_suffixes_.push_back(saElt);
4305 } else {
4306 temp_suffixes_.nullify();
4307 }
4308 }
4309 }
4310 }
4311
4312 if(lastInput) {
4313 size_t bit = sizeof(uint32_t) * 8;
4314 size_t num = (repeat_list_.size() + bit - 1) / bit;
4315 done_.resizeExact(num);
4316 done_.fillZero();
4317
4318 #ifndef NDEBUG
4319 string prev_seq = "";
4320 for(size_t i = 0; i < repeat_index_.size(); i++) {
4321 TIndexOffU saBegin = repeat_index_[i];
4322 TIndexOffU saEnd = (i + 1 < repeat_index_.size() ? repeat_index_[i+1] : repeat_list_.size());
4323 string seq = getString(s, repeat_list_[saBegin], seed_len());
4324 for(size_t j = saBegin + 1; j < saEnd; j++) {
4325 string tmp_seq = getString(s, repeat_list_[j], seed_len());
4326 assert_eq(seq, tmp_seq);
4327 }
4328 if(prev_seq != "" ) {
4329 assert_lt(prev_seq, seq);
4330 }
4331 prev_seq = seq;
4332 }
4333 #endif
4334 }
4335 }
4336
4337 template<typename TStr>
find(const TStr & s,const string & seq) const4338 Range RB_SubSA::find(const TStr& s,
4339 const string& seq) const
4340 {
4341 TIndexOffU i = find_repeat_idx(s, seq);
4342 if(i >= repeat_index_.size())
4343 return Range(0, 0);
4344
4345 Range range(repeat_index_[i],
4346 i + 1 < repeat_index_.size() ? repeat_index_[i+1] : repeat_list_.size());
4347 return range;
4348 }
4349
4350 template<typename TStr>
find_repeat_idx(const TStr & s,const string & seq) const4351 TIndexOffU RB_SubSA::find_repeat_idx(const TStr& s,
4352 const string& seq) const
4353 {
4354 assert_eq(seq.length(), seed_len_);
4355
4356 string temp;
4357 size_t l = 0, r = repeat_index_.size();
4358 while(l < r) {
4359 size_t m = (r + l) >> 1;
4360 TIndexOffU saElt = repeat_list_[repeat_index_[m]];
4361 getString(s, saElt, seed_len_, temp);
4362 if(seq == temp) {
4363 return m;
4364 } else if(seq < temp) {
4365 r = m;
4366 } else {
4367 assert(seq > temp);
4368 l = m + 1;
4369 }
4370 }
4371 return repeat_index_.size();
4372 }
4373
setDone(TIndexOffU off,TIndexOffU len)4374 void RB_SubSA::setDone(TIndexOffU off, TIndexOffU len)
4375 {
4376 assert_leq(off + len, repeat_list_.size());
4377 const TIndexOffU bit = sizeof(uint32_t) * 8;
4378 for(TIndexOffU i = off; i < off + len; i++) {
4379 TIndexOffU quotient = i / bit;
4380 TIndexOffU remainder = i % bit;
4381 assert_lt(quotient, done_.size());
4382 uint32_t num = done_[quotient];
4383 num = num | (1 << remainder);
4384 done_[quotient] = num;
4385 }
4386 }
4387
isDone(TIndexOffU off,TIndexOffU len) const4388 bool RB_SubSA::isDone(TIndexOffU off, TIndexOffU len) const
4389 {
4390 assert_leq(off + len, repeat_list_.size());
4391 const TIndexOffU bit = sizeof(uint32_t) * 8;
4392 for(TIndexOffU i = off; i < off + len; i++) {
4393 TIndexOffU quotient = i / bit;
4394 TIndexOffU remainder = i % bit;
4395 assert_lt(quotient, done_.size());
4396 uint32_t num = done_[quotient];
4397 num = num & (1 << remainder);
4398 if(num == 0)
4399 return false;
4400 }
4401
4402 return true;
4403 }
4404
4405 template<typename TStr>
buildRepeatBase(const TStr & s,CoordHelper & coordHelper,const size_t max_len,EList<RB_RepeatBase> & repeatBases)4406 void RB_SubSA::buildRepeatBase(const TStr& s,
4407 CoordHelper& coordHelper,
4408 const size_t max_len,
4409 EList<RB_RepeatBase>& repeatBases)
4410 {
4411 if(repeat_index_.empty())
4412 return;
4413
4414 done_.fillZero();
4415
4416 EList<uint8_t> senseDominant;
4417 senseDominant.resizeExact(repeat_index_.size());
4418 senseDominant.fillZero();
4419
4420 EList<size_t> repeatStack;
4421 EList<pair<TIndexOffU, TIndexOffU> > size_table;
4422 size_table.reserveExact(repeat_index_.size() / 2 + 1);
4423 for(size_t i = 0; i < repeat_index_.size(); i++) {
4424 TIndexOffU begin = repeat_index_[i];
4425 TIndexOffU end = (i + 1 < repeat_index_.size() ? repeat_index_[i+1] : repeat_list_.size());
4426 assert_lt(begin, end);
4427 EList<TIndexOffU> positions; positions.reserveExact(end - begin);
4428 for(size_t j = begin; j < end; j++) positions.push_back(repeat_list_[j]);
4429
4430 if(!isSenseDominant(coordHelper, positions, seed_len_))
4431 continue;
4432
4433 senseDominant[i] = 1;
4434 size_table.expand();
4435 size_table.back().first = end - begin;
4436 size_table.back().second = i;
4437 }
4438 size_table.sort();
4439
4440 size_t bundle_count = 0;
4441 string tmp_str;
4442 EList<Range> tmp_ranges; tmp_ranges.resizeExact(4);
4443 EList<pair<size_t, size_t> > tmp_sort_ranges; tmp_sort_ranges.resizeExact(4);
4444 for(int64_t i = (int64_t)size_table.size() - 1; i >= 0; i--) {
4445 TIndexOffU idx = size_table[i].second;
4446 ASSERT_ONLY(TIndexOffU num = size_table[i].first);
4447 assert_lt(idx, repeat_index_.size());
4448 #ifndef NDEBUG
4449 if(idx + 1 < repeat_index_.size()) {
4450 assert_eq(repeat_index_[idx] + num, repeat_index_[idx+1]);
4451 } else {
4452 assert_eq(repeat_index_[idx] + num, repeat_list_.size());
4453 }
4454 #endif
4455 TIndexOffU saBegin = repeat_index_[idx];
4456 if(isDone(saBegin)) {
4457 assert(isDone(saBegin, num));
4458 continue;
4459 }
4460
4461 ASSERT_ONLY(size_t rb_done = 0);
4462 assert_lt(saBegin, repeat_list_.size());
4463 repeatStack.push_back(idx);
4464 while(!repeatStack.empty()) {
4465 TIndexOffU idx = repeatStack.back();
4466 assert(senseDominant[idx]);
4467 repeatStack.pop_back();
4468 assert_lt(idx, repeat_index_.size());
4469 TIndexOffU saBegin = repeat_index_[idx];
4470 TIndexOffU saEnd = (idx + 1 < repeat_index_.size() ? repeat_index_[idx + 1] : repeat_list_.size());
4471 if(isDone(saBegin)) {
4472 assert(isDone(saBegin, saEnd - saBegin));
4473 continue;
4474 }
4475
4476 TIndexOffU saElt = repeat_list_[saBegin];
4477 size_t ri = repeatBases.size();
4478 repeatBases.expand();
4479 repeatBases.back().seq = getString(s, saElt, seed_len_);
4480 repeatBases.back().nodes.clear();
4481 repeatBases.back().nodes.push_back(idx);
4482 ASSERT_ONLY(rb_done++);
4483 setDone(saBegin, saEnd - saBegin);
4484 bool left = true;
4485 while(repeatBases[ri].seq.length() <= max_len) {
4486 if(left) {
4487 tmp_str = "N";
4488 tmp_str += repeatBases[ri].seq.substr(0, seed_len_ - 1);
4489 } else {
4490 tmp_str = repeatBases[ri].seq.substr(repeatBases[ri].seq.length() - seed_len_ + 1, seed_len_ - 1);
4491 tmp_str.push_back('N');
4492 }
4493 assert_eq(tmp_str.length(), seed_len_);
4494
4495 for(size_t c = 0; c < 4; c++) {
4496 if(left) tmp_str[0] = "ACGT"[c];
4497 else tmp_str.back() = "ACGT"[c];
4498 assert_eq(tmp_str.length(), seed_len_);
4499 TIndexOffU idx = find_repeat_idx(s, tmp_str);
4500 size_t num = 0;
4501 if(idx < repeat_index_.size()) {
4502 if(idx + 1 < repeat_index_.size()) {
4503 num = repeat_index_[idx+1] - repeat_index_[idx];
4504 } else {
4505 num = repeat_list_.size() - repeat_index_[idx];
4506 }
4507 }
4508 tmp_ranges[c].first = idx;
4509 tmp_ranges[c].second = num;
4510 assert(num == 0 || num >= seed_count_);
4511 tmp_sort_ranges[c].first = num;
4512 tmp_sort_ranges[c].second = c;
4513 if(idx == repeat_index_.size() ||
4514 isDone(repeat_index_[idx]) ||
4515 !senseDominant[idx]) {
4516 #ifndef NDEBUG
4517 if(idx < repeat_index_.size()) {
4518 assert(isDone(repeat_index_[idx], num) ||
4519 !senseDominant[idx]);
4520 }
4521 #endif
4522 tmp_sort_ranges[c].first = 0;
4523 tmp_ranges[c].second = 0;
4524 }
4525 }
4526 tmp_sort_ranges.sort();
4527 if(tmp_sort_ranges[3].first < seed_count_) {
4528 if(left) {
4529 left = false;
4530 continue;
4531 } else {
4532 break;
4533 }
4534 }
4535
4536 for(size_t cc = 0; cc < 3; cc++) {
4537 assert_leq(tmp_sort_ranges[cc].first, tmp_sort_ranges[cc+1].first);
4538 if(tmp_sort_ranges[cc].first < seed_count_)
4539 continue;
4540
4541 size_t c = tmp_sort_ranges[cc].second;
4542 repeatStack.push_back(tmp_ranges[c].first);
4543 }
4544
4545 size_t c = tmp_sort_ranges[3].second;
4546 if(repeatBases[ri].seq.length() >= max_len) {
4547 assert_eq(repeatBases[ri].seq.length(), max_len);
4548 TIndexOffU idx = tmp_ranges[c].first;
4549 assert(!isDone(repeat_index_[idx]));
4550 repeatStack.push_back(idx);
4551 if(left) {
4552 left = false;
4553 continue;
4554 } else {
4555 break;
4556 }
4557 } else {
4558 TIndexOffU idx = tmp_ranges[c].first;
4559 TIndexOffU num = tmp_ranges[c].second;
4560 setDone(repeat_index_[idx], num);
4561 if(left) {
4562 repeatBases[ri].seq.insert(0, 1, "ACGT"[c]);
4563 repeatBases[ri].nodes.insert(idx, 0);
4564 } else {
4565 repeatBases[ri].seq.push_back("ACGT"[c]);
4566 repeatBases[ri].nodes.push_back(idx);
4567 }
4568 }
4569 }
4570 }
4571
4572 assert_gt(rb_done, 0);
4573 bundle_count++;
4574 }
4575
4576 cerr << "Bundle count: " << bundle_count << endl;
4577
4578 #ifndef NDEBUG
4579 {
4580 set<TIndexOffU> idx_set;
4581 for(size_t i = 0; i < repeatBases.size(); i++) {
4582 const RB_RepeatBase& repeatBase = repeatBases[i];
4583 for(size_t j = 0; j < repeatBase.nodes.size(); j++) {
4584 assert(idx_set.find(repeatBase.nodes[j]) == idx_set.end());
4585 idx_set.insert(repeatBase.nodes[j]);
4586 }
4587 }
4588 }
4589 #endif
4590
4591 #if 0
4592 {
4593 EList<pair<size_t, size_t> > kmer_table;
4594 string query;
4595 size_t total = 0, match = 0;
4596 size_t interval = 1;
4597 if(repeat_index_.size() >= 10000) {
4598 interval = repeat_index_.size() / 10000;
4599 }
4600 EList<TIndexOffU> positions;
4601 for(size_t i = 0; i < repeat_index_.size(); i += interval) {
4602 TIndexOffU saElt_idx = repeat_index_[i];
4603 TIndexOffU saElt_idx_end = (i + 1 < repeat_index_.size() ? repeat_index_[i+1] : repeat_list_.size());
4604 positions.clear();
4605 for(size_t j = saElt_idx; j < saElt_idx_end; j++) {
4606 positions.push_back(repeat_list_[j]);
4607 #ifndef NDEBUG
4608 if(j > saElt_idx) {
4609 TIndexOffU lcp_len = getLCP(s,
4610 coordHelper,
4611 positions[0],
4612 positions.back(),
4613 seed_len_);
4614 assert_geq(lcp_len, seed_len_);
4615 }
4616
4617 TIndexOffU saElt = repeat_list_[j];
4618 TIndexOffU start = coordHelper.getStart(saElt);
4619 TIndexOffU start2 = coordHelper.getStart(saElt + seed_len_ - 1);
4620 assert_eq(start, start2);
4621 #endif
4622 }
4623
4624 TIndexOffU saElt = repeat_list_[saElt_idx];
4625 size_t true_count = saElt_idx_end - saElt_idx;
4626 getString(s, saElt, seed_len_, query);
4627
4628 total++;
4629
4630 size_t count = 0;
4631 for(size_t r = 0; r < repeatBases.size(); r++) {
4632 const RB_RepeatBase& repeat = repeatBases[r];
4633 int pos = repeat.seq.find(query);
4634 if(pos != string::npos) {
4635 for(size_t j = 0; j < repeat.nodes.size(); j++) {
4636 TIndexOffU _node = repeat.nodes[j];
4637 TIndexOffU _saElt_idx = repeat_index_[_node];
4638 TIndexOffU _saElt_idx_end = (_node + 1 < repeat_index_.size() ? repeat_index_[_node+1] : repeat_list_.size());
4639 TIndexOffU _saElt = repeat_list_[_saElt_idx];
4640 string seq = getString(s, _saElt, seed_len());
4641 if(query == seq) {
4642 count += (_saElt_idx_end - _saElt_idx);
4643 }
4644 }
4645 }
4646 }
4647
4648 size_t rc_count = 0;
4649 string rc_query = reverse_complement(query);
4650 for(size_t r = 0; r < repeatBases.size(); r++) {
4651 const RB_RepeatBase& repeat = repeatBases[r];
4652 int pos = repeat.seq.find(rc_query);
4653 if(pos != string::npos) {
4654 for(size_t j = 0; j < repeat.nodes.size(); j++) {
4655 TIndexOffU _node = repeat.nodes[j];
4656 TIndexOffU _saElt_idx = repeat_index_[_node];
4657 TIndexOffU _saElt_idx_end = (_node + 1 < repeat_index_.size() ? repeat_index_[_node+1] : repeat_list_.size());
4658 TIndexOffU _saElt = repeat_list_[_saElt_idx];
4659 string seq = getString(s, _saElt, seed_len());
4660 if(rc_query == seq) {
4661 rc_count += (_saElt_idx_end - _saElt_idx);
4662 }
4663 }
4664 }
4665 }
4666
4667 if(count == true_count || rc_count == true_count) {
4668 match++;
4669 } else if(total - match <= 10) {
4670 cerr << "query: " << query << endl;
4671 cerr << "true count: " << true_count << endl;
4672 cerr << "found count: " << count << endl;
4673 cerr << "rc found count: " << rc_count << endl;
4674 cerr << endl;
4675 }
4676 }
4677
4678 cerr << "RB_SubSA: sanity check: " << match << " passed (out of " << total << ")" << endl << endl;
4679 }
4680 #endif
4681 }
4682
4683
4684 #define write_fp(x) fp.write((const char *)&(x), sizeof((x)))
4685
writeFile(ofstream & fp)4686 void RB_SubSA::writeFile(ofstream& fp)
4687 {
4688 write_fp(sa_size_);
4689 write_fp(seed_len_);
4690 write_fp(seed_count_);
4691
4692 size_t sz = temp_suffixes_.size();
4693 write_fp(sz);
4694 for(size_t i = 0; i < sz; i++) {
4695 write_fp(temp_suffixes_[i]);
4696 }
4697
4698 sz = repeat_index_.size();
4699 write_fp(sz);
4700 for(size_t i = 0; i < sz; i++) {
4701 write_fp(repeat_index_[i]);
4702 }
4703
4704 sz = done_.size();
4705 write_fp(sz);
4706 for(size_t i = 0; i < sz; i++) {
4707 write_fp(done_[i]);
4708 }
4709 }
4710
4711 #define read_fp(x) fp.read((char *)&(x), sizeof((x)))
4712
readFile(ifstream & fp)4713 void RB_SubSA::readFile(ifstream& fp)
4714 {
4715 TIndexOffU val;
4716
4717 read_fp(val);
4718 rt_assert_eq(val, sa_size_);
4719
4720 read_fp(val);
4721 rt_assert_eq(val, seed_len_);
4722
4723 read_fp(val);
4724 rt_assert_eq(val, seed_count_);
4725
4726 size_t sz;
4727 read_fp(sz);
4728 temp_suffixes_.resizeExact(sz);
4729 for(size_t i = 0; i < sz; i++) {
4730 read_fp(temp_suffixes_[i]);
4731 }
4732
4733 size_t val_sz;
4734
4735 // repeat_index_
4736 read_fp(val_sz);
4737 repeat_index_.resizeExact(val_sz);
4738 for(size_t i = 0; i < val_sz; i++) {
4739 read_fp(repeat_index_[i]);
4740 }
4741
4742 // done
4743 read_fp(val_sz);
4744 done_.resizeExact(val_sz);
4745 for(size_t i = 0; i < val_sz; i++) {
4746 read_fp(done_[i]);
4747 }
4748
4749 }
4750
4751
4752 /****************************/
4753 template class RepeatBuilder<SString<char> >;
4754 template void dump_tstr(const SString<char>& );
4755 template bool compareRepeatCoordByJoinedOff(const RepeatCoord<TIndexOffU>& , const RepeatCoord<TIndexOffU>&);
4756