1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-OA
2 //
3 // Copyright 2013-2018, Julian Catchen <jcatchen@illinois.edu>
4 //
5 // This file is part of Stacks.
6 //
7 // Stacks is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // Stacks is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 // GNU General Public License for more details.
16 //
17 // You should have received a copy of the GNU General Public License
18 // along with Stacks. If not, see <http://www.gnu.org/licenses/>.
19 //
20
21 //
22 // locus.cc -- routines for the Locus class and its derivatives.
23 //
24 #include "locus.h"
25
26 #include "utils.h"
27
Locus(const Locus & templ)28 Locus::Locus(const Locus &templ)
29 {
30 this->id = templ.id;
31 this->sample_id = templ.sample_id;
32 this->depth = templ.depth;
33 this->len = templ.len;
34 this->blacklisted = templ.blacklisted;
35 this->deleveraged = templ.deleveraged;
36 this->lumberjackstack = templ.lumberjackstack;
37
38 this->model = NULL;
39 this->con = NULL;
40
41 if (templ.model != NULL) {
42 this->model = new char [templ.len + 1];
43 strcpy(this->model, templ.model);
44 }
45 if (templ.con != NULL) {
46 this->con = new char [templ.len + 1];
47 strcpy(this->con, templ.con);
48 }
49
50 for (uint i = 0; i < templ.comp.size(); i++) {
51 char *c = new char [strlen(templ.comp[i]) + 1];
52 strcpy(c, templ.comp[i]);
53 this->comp.push_back(c);
54 }
55 for (uint i = 0; i < templ.reads.size(); i++) {
56 char *c = new char [strlen(templ.reads[i]) + 1];
57 strcpy(c, templ.reads[i]);
58 this->reads.push_back(c);
59 }
60 for (uint i = 0; i < templ.snps.size(); i++) {
61 SNP *s = new SNP(*templ.snps[i]);
62 this->snps.push_back(s);
63 }
64
65 this->comp_cnt = vector<uint>(templ.comp_cnt);
66 this->comp_type = vector<read_type>(templ.comp_type);
67 this->loc = templ.loc;
68 this->alleles = map<string, int>(templ.alleles);
69 this->strings = vector<pair<allele_type, string>>(templ.strings);
70
71 }
72
73 int
snp_index(uint col) const74 Locus::snp_index(uint col) const
75 {
76 for (uint i = 0; i < this->snps.size(); i++)
77 if (this->snps[i]->col == col)
78 return i;
79 DOES_NOT_HAPPEN;
80 return -1;
81 }
82
83 int
add_consensus(const char * seq)84 Locus::add_consensus(const char *seq)
85 {
86 if (this->con != NULL)
87 delete [] this->con;
88
89 this->len = strlen(seq);
90 this->con = new char[this->len + 1];
91 strcpy(this->con, seq);
92
93 return 0;
94 }
95
96 int
add_model(const char * seq)97 Locus::add_model(const char *seq)
98 {
99 if (this->model != NULL)
100 delete [] this->model;
101
102 this->model = new char[this->len + 1];
103 strncpy(this->model, seq, this->len);
104 this->model[this->len] = '\0';
105
106 return 0;
107 }
108
109 int
populate_alleles()110 Locus::populate_alleles()
111 {
112 vector<SNP *>::iterator i;
113 map<string, int>::iterator j;
114 string s;
115 uint k;
116
117 if (this->len > strlen(this->con))
118 cerr << "Recorded locus->len: " << this->len << "; consensus length: " << strlen(this->con) << "\n";
119
120 //
121 // Is this effective?
122 //
123 for (uint n = 0; n < this->strings.size(); n++) {
124 this->strings[n].first.clear();
125 this->strings[n].second.clear();
126 }
127 this->strings.clear();
128
129 if (this->snps.size() == 0) {
130 this->strings.push_back(make_pair("consensus", this->con));
131 return 0;
132 }
133
134 for (j = this->alleles.begin(); j != this->alleles.end(); j++) {
135 s = this->con;
136 k = 0;
137
138 for (i = this->snps.begin(); i != this->snps.end(); i++) {
139 if ((*i)->type == snp_type_het && (*i)->col < this->len && k < j->first.length()) {
140 s.replace((*i)->col, 1, 1, j->first[k]);
141 k++;
142 }
143 }
144
145 this->strings.push_back(make_pair(j->first, s));
146 }
147
148 return 0;
149 }
150
151 int
adjust_snps_for_gaps(Cigar & cigar,Locus * loc)152 adjust_snps_for_gaps(Cigar &cigar, Locus *loc)
153 {
154 uint size = cigar.size();
155 char op;
156 uint dist, bp, stop, offset, snp_index;
157
158 bp = 0;
159 offset = 0;
160 snp_index = 0;
161
162 for (uint i = 0; i < size; i++) {
163 op = cigar[i].first;
164 dist = cigar[i].second;
165
166 switch(op) {
167 case 'D':
168 offset += dist;
169 break;
170 case 'I':
171 case 'M':
172 case 'S':
173 stop = bp + dist;
174 while (bp < stop && snp_index < loc->snps.size()) {
175 if (loc->snps[snp_index]->col == bp) {
176 loc->snps[snp_index]->col += offset;
177 snp_index++;
178 }
179 bp++;
180 }
181 break;
182 default:
183 break;
184 }
185 }
186
187 return 0;
188 }
189
190 int
adjust_and_add_snps_for_gaps(Cigar & cigar,Locus * loc)191 adjust_and_add_snps_for_gaps(Cigar &cigar, Locus *loc)
192 {
193 uint size = cigar.size();
194 char op;
195 uint dist, bp, new_bp, stop, snp_cnt;
196 SNP *s;
197
198 bp = 0;
199 new_bp = 0;
200 snp_cnt = loc->snps.size();
201
202 vector<SNP *> snps;
203
204 for (uint i = 0; i < size; i++) {
205 op = cigar[i].first;
206 dist = cigar[i].second;
207
208 switch(op) {
209 case 'D':
210 stop = new_bp + dist;
211 while (new_bp < stop) {
212 s = new SNP;
213 s->col = new_bp;
214 s->type = snp_type_unk;
215 s->rank_1 = 'N';
216 snps.push_back(s);
217 new_bp++;
218 }
219 break;
220 case 'I':
221 case 'M':
222 case 'S':
223 stop = bp + dist > snp_cnt ? snp_cnt : bp + dist;
224 while (bp < stop) {
225 loc->snps[bp]->col = new_bp;
226 snps.push_back(loc->snps[bp]);
227 bp++;
228 new_bp++;
229 }
230 break;
231 default:
232 break;
233 }
234 }
235
236 loc->snps.clear();
237
238 for (uint i = 0; i < snps.size(); i++)
239 loc->snps.push_back(snps[i]);
240
241 return 0;
242 }
243
244 int
remove_snps_from_gaps(Cigar & cigar,Locus * loc)245 remove_snps_from_gaps(Cigar &cigar, Locus *loc)
246 {
247 uint size = cigar.size();
248 char op;
249 uint dist, bp, new_bp, stop, snp_cnt;
250
251 bp = 0;
252 new_bp = 0;
253 snp_cnt = loc->snps.size();
254
255 vector<SNP *> snps;
256
257 for (uint i = 0; i < size; i++) {
258 op = cigar[i].first;
259 dist = cigar[i].second;
260
261 switch(op) {
262 case 'D':
263 stop = bp + dist;
264 while (bp < stop) {
265 delete loc->snps[bp];
266 bp++;
267 }
268 break;
269 case 'I':
270 case 'M':
271 case 'S':
272 stop = bp + dist > snp_cnt ? snp_cnt : bp + dist;
273 while (bp < stop) {
274 loc->snps[bp]->col = new_bp;
275 snps.push_back(loc->snps[bp]);
276 bp++;
277 new_bp++;
278 }
279 break;
280 default:
281 break;
282 }
283 }
284
285 loc->snps.clear();
286
287 for (uint i = 0; i < snps.size(); i++)
288 loc->snps.push_back(snps[i]);
289
290 return 0;
291 }
292
QLocus(const QLocus & other)293 QLocus::QLocus(const QLocus &other): Locus(other)
294 {
295 for (auto it = other.matches.begin(); it != other.matches.end(); it++) {
296 Match *m = *it;
297 this->matches.push_back(new Match(*m));
298 }
299 }
300
~QLocus()301 QLocus::~QLocus()
302 {
303 for (auto it = this->matches.begin(); it != this->matches.end(); it++)
304 delete *it;
305 }
306
307 int
add_match(int catalog_id,allele_type cat_type,allele_type query_type,int distance)308 QLocus::add_match(int catalog_id, allele_type cat_type, allele_type query_type, int distance)
309 {
310 Match *m = new Match;
311
312 m->cat_id = catalog_id;
313 m->cat_type = cat_type;
314 m->query_type = query_type;
315 m->dist = distance;
316
317 this->matches.push_back(m);
318
319 return 0;
320 }
321
322 int
add_match(int catalog_id,allele_type cat_type,allele_type query_type,int distance,string cigar)323 QLocus::add_match(int catalog_id, allele_type cat_type, allele_type query_type, int distance, string cigar)
324 {
325 Match *m = new Match;
326
327 m->cat_id = catalog_id;
328 m->cat_type = cat_type;
329 m->query_type = query_type;
330 m->dist = distance;
331 m->cigar = cigar;
332
333 this->matches.push_back(m);
334
335 return 0;
336 }
337
338 int
add_match(int catalog_id,allele_type cat_type)339 QLocus::add_match(int catalog_id, allele_type cat_type)
340 {
341 Match *m = new Match;
342
343 m->cat_id = catalog_id;
344 m->cat_type = cat_type;
345 m->query_type = "";
346 m->dist = 0;
347
348 this->matches.push_back(m);
349
350 return 0;
351 }
352
353 int
clear_matches()354 QLocus::clear_matches()
355 {
356 vector<Match *>::iterator it;
357
358 for (it = this->matches.begin(); it != this->matches.end(); it++)
359 delete *it;
360 this->matches.clear();
361
362 return 0;
363 }
364
365 void
clear()366 CLocAlnSet::clear() {
367 id_= -1;
368 aln_pos_.clear();
369 ref_ = DNASeq4();
370 mpopi_ = NULL;
371 reads_.clear();
372 reads_per_sample_.clear();
373 }
374
n_samples() const375 size_t CLocReadSet::n_samples() const {
376 vector<size_t> n_reads_per_sample (mpopi_->samples().size());
377 for (auto& r : reads_)
378 ++n_reads_per_sample[r.sample];
379 size_t n_samples = 0;
380 for (size_t n_s_reads : n_reads_per_sample)
381 if (n_s_reads > 0)
382 ++n_samples;
383 return n_samples;
384 }
385
reinit(int id,const PhyLoc & aln_pos,const MetaPopInfo * mpopi)386 void CLocAlnSet::reinit(int id, const PhyLoc& aln_pos, const MetaPopInfo* mpopi) {
387 clear();
388 id_ = id;
389 aln_pos_ = aln_pos;
390 mpopi_ = mpopi;
391 reads_per_sample_.resize(mpopi->samples().size());
392 }
393
recompute_consensus()394 void CLocAlnSet::recompute_consensus() {
395 Counts<Nt4> cnts;
396 size_t i = 0;
397 for (CLocAlnSet::site_iterator site (*this); bool(site); ++site, ++i) {
398 site.counts(cnts);
399 pair<size_t,Nt4> best_nt = cnts.sorted()[0];
400 if (best_nt.first > 0)
401 ref_.set(i, best_nt.second);
402 else
403 ref_.set(i, Nt4::n);
404 }
405 assert(i == ref_.length());
406 }
407
hard_clip_right_Ns()408 void CLocAlnSet::hard_clip_right_Ns() {
409 assert(!ref_.empty());
410 //assert(ref_[0] != Nt4::n); // consensus must have been computed, and the first nt is in the cutsite.
411 if (ref_[0] == Nt4::n)
412 cerr << "WARNING: Some loci seemingly do not start with a cutsite; check your input reads. ("
413 << id_ << ")\n";
414
415 size_t to_clip = 0;
416 DNASeq4::iterator nt = ref_.end();
417 while(nt != ref_.begin() && *--nt == Nt4::n)
418 ++to_clip;
419 assert(to_clip < ref_.length());
420
421 ref_.resize(ref_.length() - to_clip);
422
423 for (SAlnRead& r : reads_) {
424 if (r.cigar.empty() || r.cigar.back().first != 'M' || r.cigar.back().second <= to_clip)
425 DOES_NOT_HAPPEN;
426 r.seq.resize(ref_.length());
427 r.cigar.back().second -= to_clip;
428 }
429 }
430
431 void
sort_by_read_name()432 CLocAlnSet::sort_by_read_name()
433 {
434 sort(this->reads_.begin(), this->reads_.end(),
435 [] (const SAlnRead& r1, const SAlnRead& r2) { return r1.name < r2.name; }
436 );
437
438 for (auto& s : reads_per_sample_)
439 s.clear();
440 for (size_t i=0; i<reads_.size(); ++i)
441 reads_per_sample_[reads_[i].sample].push_back(i);
442 }
443
444 void
sort_by_alignment_offset()445 CLocAlnSet::sort_by_alignment_offset()
446 {
447 auto aln_offset = [] (const Cigar& c) ->size_t {
448 assert(!c.empty());
449 assert(cigar_is_MDI(c));
450 if (c[0].first == 'D') {
451 return c[0].second;
452 } else if (c[0].first == 'I'
453 && c.size() >= 2
454 && c[1].first == 'D')
455 {
456 return c[1].second;
457 } else {
458 return 0;
459 }
460 };
461
462 sort(this->reads_.begin(), this->reads_.end(),
463 [&aln_offset] (const SAlnRead& r1, const SAlnRead& r2) {
464 return aln_offset(r1.cigar) < aln_offset(r2.cigar);
465 });
466
467 for (auto& s : reads_per_sample_)
468 s.clear();
469 for (size_t i=0; i<reads_.size(); ++i)
470 reads_per_sample_[reads_[i].sample].push_back(i);
471 }
472
473 void
merge_paired_reads()474 CLocAlnSet::merge_paired_reads()
475 {
476 //
477 // Sort reads by name. Paired reads should have the same name but end with
478 // respectively "/1" and "/2".
479 //
480 sort_by_read_name();
481
482 // Merge paired reads.
483 for (auto r1 = this->reads_.begin(); r1 != this->reads_.end(); ++r1) {
484 auto r2 = r1;
485 ++r2;
486
487 if (r2 == this->reads_.end())
488 break;
489
490 const string& n1 = r1->name;
491 const string& n2 = r2->name;
492 const size_t l = n1.length();
493
494 if (n2.length() == l && l >= 2 &&
495 n1[l-2] == '/' && n1[l-1] == '1' &&
496 n2[l-2] == '/' && n2[l-1] == '2' &&
497 n1.compare(0, l-2, n2, 0, l-2) == 0) {
498
499 // r1 and r2 are paired, merge them.
500 //assert(r1->sample == r2->sample); // xxx Fix process_radtags.
501 if (r1->sample == r2->sample)
502 *r1 = SAlnRead(AlnRead::merger_of(move(*r1), move(*r2)), r1->sample);
503
504 assert(cigar_length_ref(r1->cigar) == ref_.length());
505
506 // Mark r2 for removal and skip it.
507 r2->seq.clear();
508 ++r1;
509 if (r1 == this->reads_.end())
510 break;
511 }
512 }
513
514 // Remove emptied reads.
515 reads_.erase(std::remove_if(reads_.begin(),
516 reads_.end(),
517 [](const Read& r) { return r.seq.empty(); }
518 ),
519 reads_.end());
520
521 // Refresh `reads_per_sample_`.
522 reads_per_sample_ = vector<vector<size_t>>(mpopi().samples().size());
523 for (size_t i=0; i<reads_.size(); ++i)
524 reads_per_sample_[reads_[i].sample].push_back(i);
525 }
526
operator <<(ostream & os,const CLocAlnSet & loc)527 ostream& operator<< (ostream& os, const CLocAlnSet& loc) {
528 os << "ref\t.\t" << loc.ref().str();
529 for (auto& r : loc.reads_)
530 os << "\n" << r.name << "\t" << loc.mpopi().samples()[r.sample].name << "\t" << r.aln();
531 return os;
532 }
533
534 void
remove_unmerged_reads(ostream * log)535 CLocAlnSet::remove_unmerged_reads(ostream* log)
536 {
537 if (log != NULL)
538 *log << "BEGIN unpaired_reads\n";
539
540 for (SAlnRead& r : reads_) {
541 if (r.name.back() != 'm') {
542 r.seq.clear();
543 if (log != NULL)
544 *log << "rm_unpaired\t" << r.name << '\n';
545 }
546 }
547
548 // Remove emptied reads.
549 stacks_erase_if(reads_, [](const Read& r){return r.seq.empty();} );
550
551 // Refresh `reads_per_sample_`.
552 reads_per_sample_ = vector<vector<size_t>>(mpopi().samples().size());
553 for (size_t i=0; i<reads_.size(); ++i)
554 reads_per_sample_[reads_[i].sample].push_back(i);
555
556 if (log != NULL)
557 *log << "END unpaired_reads\n";
558 }
559
560 void
remove_pcr_duplicates(vector<size_t> * clone_size_distrib,ostream * log)561 CLocAlnSet::remove_pcr_duplicates(vector<size_t>* clone_size_distrib, ostream* log)
562 {
563 if (log != NULL)
564 *log << "BEGIN pcr_duplicates\n";
565
566 //
567 // Sort reads by (I) sample; (II) insert size; (III, for stability) name.
568 //
569 for (SAlnRead& r : this->reads_) {
570 assert(!r.cigar.empty());
571 assert(cigar_is_MDI(r.cigar));
572 // assert(r.cigar.front().first == 'M'); // This is the cutsite. //FIXME:
573 cigar_canonicalize_MDI_order(r.cigar);
574 }
575 auto compute_insert_length = [] (const Cigar& c) -> size_t {
576 // rem. We assume the asserts above.
577 size_t len = cigar_length_ref(c);
578 auto last = c.rbegin();
579 if (last->first == 'I') {
580 len += last->second;
581 ++last;
582 }
583 if (last != c.rend() && last->first == 'D')
584 len -= last->second;
585 return len;
586 };
587 sort(this->reads_.begin(), this->reads_.end(),
588 [&compute_insert_length](const SAlnRead& r1, const SAlnRead& r2) {
589 if (r1.sample != r2.sample)
590 return r1.sample < r2.sample;
591 size_t i1 = compute_insert_length(r1.cigar);
592 size_t i2 = compute_insert_length(r2.cigar);
593 if (i1 != i2)
594 return i1 < i2;
595 return r1.name.compare(r2.name) < 0;
596 });
597
598 // Remove reads that have the same insert length.
599 vector<SAlnRead>::iterator r = this->reads_.begin();
600 assert(r != this->reads_.end());
601 size_t len = compute_insert_length(r->cigar);
602 while (r != this->reads_.end()) {
603 auto group = r;
604 size_t group_len = len;
605 ++r;
606 while (r != this->reads_.end()) {
607 len = compute_insert_length(r->cigar);
608 if (r->sample != group->sample || len != group_len)
609 break;
610 ++r;
611 }
612 size_t clone_size = r - group;
613 if (clone_size > 1)
614 for (auto r2=group+1; r2!=r; ++r2)
615 r2->seq.clear();
616 if (clone_size_distrib != NULL) {
617 if(clone_size_distrib->size() < clone_size + 1) {
618 clone_size_distrib->resize(clone_size + 1);
619 }
620 ++clone_size_distrib->at(clone_size);
621 }
622 if (log != NULL) {
623 *log << "pcr_dupls\t" << mpopi().samples()[group->sample].name
624 << '\t';
625 for (auto r2=group; r2!=r; ++r2)
626 *log << ',' << r2->name;;
627 *log << '\t' << group_len << '\n';
628 }
629 }
630
631 // Remove emptied reads.
632 stacks_erase_if(reads_, [](const Read& r){return r.seq.empty();} );
633
634 // Refresh `reads_per_sample_`.
635 reads_per_sample_ = vector<vector<size_t>>(mpopi().samples().size());
636 for (size_t i=0; i<reads_.size(); ++i)
637 reads_per_sample_[reads_[i].sample].push_back(i);
638
639 if (log != NULL)
640 *log << "END pcr_duplicates\n";
641 }
642
643 CLocAlnSet
juxtapose(CLocAlnSet && left,CLocAlnSet && right,long offset)644 CLocAlnSet::juxtapose(CLocAlnSet&& left, CLocAlnSet&& right, long offset)
645 {
646 assert(left.id() == right.id());
647 assert(left.pos() == right.pos());
648 assert(&left.mpopi() == &right.mpopi());
649
650 size_t left_ref_len = left.ref().length();
651 CLocAlnSet merged (move(left));
652
653 //
654 // N.B. Oct. 2017:
655 //
656 // It actually possible that
657 // ``` size_t(-offset) > left.ref().length() ```
658 // happens legitimately: if inserts smaller than the read size have been
659 // sequenced, the paired-end contig may end upstream of the forward reads,
660 // in the barcode/adapter region. In this case, we don't do anything special,
661 // as with the current approach the left of the paired-end contig (the
662 // `right` matrix) is trimmed anyway.
663 //
664 // Similarly, it is possible that
665 // ``` size_t(-offset) > right.ref().length() ```
666 // happens legitimately, for the same reason. In this case, we just move the
667 // paired-end reads to the FW object, soft-clipping them entirely.
668 //
669 // At the moment, we don't do any special trimming of the small-insert reads.
670 //
671 if (offset < 0 && size_t(-offset) > right.ref().length()) {
672 for (SAlnRead& r : right.reads_) {
673 r.cigar.assign({{'D', merged.ref().length()}, {'I', r.seq.length()}});
674 merged.add(move(r));
675 }
676 left.clear();
677 right.clear();
678 return merged;
679 }
680
681 // Extend the reference sequence.
682 if (offset >= 0) {
683 for (long i=0; i<offset; ++i)
684 merged.ref_.push_back(Nt4::n);
685 merged.ref_.append(right.ref().begin(), right.ref().end());
686 } else {
687 auto right_itr = right.ref().begin();
688 for (long i=0; i<-offset; ++i) {
689 assert(right_itr != right.ref().end());
690 ++right_itr;
691 }
692 merged.ref_.append(right_itr, right.ref().end());
693 }
694
695 // Extend the left reads.
696 for (SAlnRead& r : merged.reads_) {
697 cigar_extend_right(r.cigar, right.ref().length() + offset);
698 assert(cigar_length_ref(r.cigar) == merged.ref().length());
699 }
700
701 // Extend/Trim & add the right reads.
702 for (SAlnRead& r : right.reads_) {
703 if (offset >= 0) {
704 cigar_extend_left(r.cigar, left_ref_len + offset);
705 } else {
706 cigar_trim_left(r.cigar, -offset);
707 assert(cigar_length_query(r.cigar) == r.seq.length());
708 cigar_extend_left(r.cigar, left_ref_len);
709 }
710 merged.add(move(r));
711 }
712 right.reads_ = vector<SAlnRead>();
713 right.reads_per_sample_ = vector<vector<size_t>>(right.mpopi().samples().size());
714
715 left.clear();
716 right.clear();
717 return merged;
718 }
719