1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-OA
2 //
3 // Copyright 2013-2018, Julian Catchen <jcatchen@illinois.edu>
4 //
5 // This file is part of Stacks.
6 //
7 // Stacks is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // Stacks is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 // GNU General Public License for more details.
16 //
17 // You should have received a copy of the GNU General Public License
18 // along with Stacks.  If not, see <http://www.gnu.org/licenses/>.
19 //
20 
21 //
22 // locus.cc -- routines for the Locus class and its derivatives.
23 //
24 #include "locus.h"
25 
26 #include "utils.h"
27 
Locus(const Locus & templ)28 Locus::Locus(const Locus &templ)
29 {
30     this->id              = templ.id;
31     this->sample_id       = templ.sample_id;
32     this->depth           = templ.depth;
33     this->len             = templ.len;
34     this->blacklisted     = templ.blacklisted;
35     this->deleveraged     = templ.deleveraged;
36     this->lumberjackstack = templ.lumberjackstack;
37 
38     this->model = NULL;
39     this->con   = NULL;
40 
41     if (templ.model != NULL) {
42         this->model = new char [templ.len + 1];
43         strcpy(this->model, templ.model);
44     }
45     if (templ.con != NULL) {
46         this->con = new char [templ.len + 1];
47         strcpy(this->con, templ.con);
48     }
49 
50     for (uint i = 0; i < templ.comp.size(); i++) {
51         char *c = new char [strlen(templ.comp[i]) + 1];
52         strcpy(c, templ.comp[i]);
53         this->comp.push_back(c);
54     }
55     for (uint i = 0; i < templ.reads.size(); i++) {
56         char *c = new char [strlen(templ.reads[i]) + 1];
57         strcpy(c, templ.reads[i]);
58         this->reads.push_back(c);
59     }
60     for (uint i = 0; i < templ.snps.size(); i++) {
61         SNP *s = new SNP(*templ.snps[i]);
62         this->snps.push_back(s);
63     }
64 
65     this->comp_cnt  = vector<uint>(templ.comp_cnt);
66     this->comp_type = vector<read_type>(templ.comp_type);
67     this->loc       = templ.loc;
68     this->alleles   = map<string, int>(templ.alleles);
69     this->strings   = vector<pair<allele_type, string>>(templ.strings);
70 
71 }
72 
73 int
snp_index(uint col) const74 Locus::snp_index(uint col) const
75 {
76     for (uint i = 0; i < this->snps.size(); i++)
77         if (this->snps[i]->col == col)
78             return i;
79     DOES_NOT_HAPPEN;
80     return -1;
81 }
82 
83 int
add_consensus(const char * seq)84 Locus::add_consensus(const char *seq)
85 {
86     if (this->con != NULL)
87         delete [] this->con;
88 
89     this->len = strlen(seq);
90     this->con = new char[this->len + 1];
91     strcpy(this->con, seq);
92 
93     return 0;
94 }
95 
96 int
add_model(const char * seq)97 Locus::add_model(const char *seq)
98 {
99     if (this->model != NULL)
100         delete [] this->model;
101 
102     this->model = new char[this->len + 1];
103     strncpy(this->model, seq, this->len);
104     this->model[this->len] = '\0';
105 
106     return 0;
107 }
108 
109 int
populate_alleles()110 Locus::populate_alleles()
111 {
112     vector<SNP *>::iterator  i;
113     map<string, int>::iterator j;
114     string s;
115     uint   k;
116 
117     if (this->len > strlen(this->con))
118         cerr << "Recorded locus->len: " << this->len << "; consensus length: " << strlen(this->con) << "\n";
119 
120     //
121     // Is this effective?
122     //
123     for (uint n = 0; n < this->strings.size(); n++) {
124         this->strings[n].first.clear();
125         this->strings[n].second.clear();
126     }
127     this->strings.clear();
128 
129     if (this->snps.size() == 0) {
130         this->strings.push_back(make_pair("consensus", this->con));
131         return 0;
132     }
133 
134     for (j = this->alleles.begin(); j != this->alleles.end(); j++) {
135         s = this->con;
136         k = 0;
137 
138         for (i = this->snps.begin(); i != this->snps.end(); i++) {
139             if ((*i)->type == snp_type_het && (*i)->col < this->len && k < j->first.length()) {
140                 s.replace((*i)->col, 1, 1, j->first[k]);
141                 k++;
142             }
143         }
144 
145         this->strings.push_back(make_pair(j->first, s));
146     }
147 
148     return 0;
149 }
150 
151 int
adjust_snps_for_gaps(Cigar & cigar,Locus * loc)152 adjust_snps_for_gaps(Cigar &cigar, Locus *loc)
153 {
154     uint   size = cigar.size();
155     char   op;
156     uint   dist, bp, stop, offset, snp_index;
157 
158     bp        = 0;
159     offset    = 0;
160     snp_index = 0;
161 
162     for (uint i = 0; i < size; i++)  {
163         op   = cigar[i].first;
164         dist = cigar[i].second;
165 
166         switch(op) {
167         case 'D':
168             offset += dist;
169             break;
170         case 'I':
171         case 'M':
172         case 'S':
173             stop = bp + dist;
174             while (bp < stop && snp_index < loc->snps.size()) {
175                 if (loc->snps[snp_index]->col == bp) {
176                     loc->snps[snp_index]->col += offset;
177                     snp_index++;
178                 }
179                 bp++;
180             }
181             break;
182         default:
183             break;
184         }
185     }
186 
187     return 0;
188 }
189 
190 int
adjust_and_add_snps_for_gaps(Cigar & cigar,Locus * loc)191 adjust_and_add_snps_for_gaps(Cigar &cigar, Locus *loc)
192 {
193     uint   size = cigar.size();
194     char   op;
195     uint   dist, bp, new_bp, stop, snp_cnt;
196     SNP   *s;
197 
198     bp      = 0;
199     new_bp  = 0;
200     snp_cnt = loc->snps.size();
201 
202     vector<SNP *> snps;
203 
204     for (uint i = 0; i < size; i++)  {
205         op   = cigar[i].first;
206         dist = cigar[i].second;
207 
208         switch(op) {
209         case 'D':
210             stop = new_bp + dist;
211             while (new_bp < stop) {
212                 s = new SNP;
213                 s->col    = new_bp;
214                 s->type   = snp_type_unk;
215                 s->rank_1 = 'N';
216                 snps.push_back(s);
217                 new_bp++;
218             }
219             break;
220         case 'I':
221         case 'M':
222         case 'S':
223             stop = bp + dist > snp_cnt ? snp_cnt : bp + dist;
224             while (bp < stop) {
225                 loc->snps[bp]->col = new_bp;
226                 snps.push_back(loc->snps[bp]);
227                 bp++;
228                 new_bp++;
229             }
230             break;
231         default:
232             break;
233         }
234     }
235 
236     loc->snps.clear();
237 
238     for (uint i = 0; i < snps.size(); i++)
239         loc->snps.push_back(snps[i]);
240 
241     return 0;
242 }
243 
244 int
remove_snps_from_gaps(Cigar & cigar,Locus * loc)245 remove_snps_from_gaps(Cigar &cigar, Locus *loc)
246 {
247     uint   size = cigar.size();
248     char   op;
249     uint   dist, bp, new_bp, stop, snp_cnt;
250 
251     bp      = 0;
252     new_bp  = 0;
253     snp_cnt = loc->snps.size();
254 
255     vector<SNP *> snps;
256 
257     for (uint i = 0; i < size; i++)  {
258         op   = cigar[i].first;
259         dist = cigar[i].second;
260 
261         switch(op) {
262         case 'D':
263             stop = bp + dist;
264             while (bp < stop) {
265                 delete loc->snps[bp];
266                 bp++;
267             }
268             break;
269         case 'I':
270         case 'M':
271         case 'S':
272             stop = bp + dist > snp_cnt ? snp_cnt : bp + dist;
273             while (bp < stop) {
274                 loc->snps[bp]->col = new_bp;
275                 snps.push_back(loc->snps[bp]);
276                 bp++;
277                 new_bp++;
278             }
279             break;
280         default:
281             break;
282         }
283     }
284 
285     loc->snps.clear();
286 
287     for (uint i = 0; i < snps.size(); i++)
288         loc->snps.push_back(snps[i]);
289 
290     return 0;
291 }
292 
QLocus(const QLocus & other)293 QLocus::QLocus(const QLocus &other): Locus(other)
294 {
295     for (auto it = other.matches.begin(); it != other.matches.end(); it++) {
296         Match *m = *it;
297         this->matches.push_back(new Match(*m));
298     }
299 }
300 
~QLocus()301 QLocus::~QLocus()
302 {
303     for (auto it = this->matches.begin(); it != this->matches.end(); it++)
304         delete *it;
305 }
306 
307 int
add_match(int catalog_id,allele_type cat_type,allele_type query_type,int distance)308 QLocus::add_match(int catalog_id, allele_type cat_type, allele_type query_type, int distance)
309 {
310     Match *m = new Match;
311 
312     m->cat_id     = catalog_id;
313     m->cat_type   = cat_type;
314     m->query_type = query_type;
315     m->dist       = distance;
316 
317     this->matches.push_back(m);
318 
319     return 0;
320 }
321 
322 int
add_match(int catalog_id,allele_type cat_type,allele_type query_type,int distance,string cigar)323 QLocus::add_match(int catalog_id, allele_type cat_type, allele_type query_type, int distance, string cigar)
324 {
325     Match *m = new Match;
326 
327     m->cat_id     = catalog_id;
328     m->cat_type   = cat_type;
329     m->query_type = query_type;
330     m->dist       = distance;
331     m->cigar      = cigar;
332 
333     this->matches.push_back(m);
334 
335     return 0;
336 }
337 
338 int
add_match(int catalog_id,allele_type cat_type)339 QLocus::add_match(int catalog_id, allele_type cat_type)
340 {
341     Match *m = new Match;
342 
343     m->cat_id     = catalog_id;
344     m->cat_type   = cat_type;
345     m->query_type = "";
346     m->dist       = 0;
347 
348     this->matches.push_back(m);
349 
350     return 0;
351 }
352 
353 int
clear_matches()354 QLocus::clear_matches()
355 {
356     vector<Match *>::iterator it;
357 
358     for (it = this->matches.begin(); it != this->matches.end(); it++)
359         delete *it;
360     this->matches.clear();
361 
362     return 0;
363 }
364 
365 void
clear()366 CLocAlnSet::clear() {
367     id_= -1;
368     aln_pos_.clear();
369     ref_ = DNASeq4();
370     mpopi_ = NULL;
371     reads_.clear();
372     reads_per_sample_.clear();
373 }
374 
n_samples() const375 size_t CLocReadSet::n_samples() const {
376     vector<size_t> n_reads_per_sample (mpopi_->samples().size());
377     for (auto& r : reads_)
378         ++n_reads_per_sample[r.sample];
379     size_t n_samples = 0;
380     for (size_t n_s_reads : n_reads_per_sample)
381         if (n_s_reads > 0)
382             ++n_samples;
383     return n_samples;
384 }
385 
reinit(int id,const PhyLoc & aln_pos,const MetaPopInfo * mpopi)386 void CLocAlnSet::reinit(int id, const PhyLoc& aln_pos, const MetaPopInfo* mpopi) {
387     clear();
388     id_ = id;
389     aln_pos_ = aln_pos;
390     mpopi_ = mpopi;
391     reads_per_sample_.resize(mpopi->samples().size());
392 }
393 
recompute_consensus()394 void CLocAlnSet::recompute_consensus() {
395     Counts<Nt4> cnts;
396     size_t i = 0;
397     for (CLocAlnSet::site_iterator site (*this); bool(site); ++site, ++i) {
398         site.counts(cnts);
399         pair<size_t,Nt4> best_nt = cnts.sorted()[0];
400         if (best_nt.first > 0)
401             ref_.set(i, best_nt.second);
402         else
403             ref_.set(i, Nt4::n);
404     }
405     assert(i == ref_.length());
406 }
407 
hard_clip_right_Ns()408 void CLocAlnSet::hard_clip_right_Ns() {
409     assert(!ref_.empty());
410     //assert(ref_[0] != Nt4::n); // consensus must have been computed, and the first nt is in the cutsite.
411     if (ref_[0] == Nt4::n)
412         cerr << "WARNING: Some loci seemingly do not start with a cutsite; check your input reads. ("
413              << id_ << ")\n";
414 
415     size_t to_clip = 0;
416     DNASeq4::iterator nt = ref_.end();
417     while(nt != ref_.begin() && *--nt == Nt4::n)
418         ++to_clip;
419     assert(to_clip < ref_.length());
420 
421     ref_.resize(ref_.length() - to_clip);
422 
423     for (SAlnRead& r : reads_) {
424         if (r.cigar.empty() || r.cigar.back().first != 'M' || r.cigar.back().second <= to_clip)
425             DOES_NOT_HAPPEN;
426         r.seq.resize(ref_.length());
427         r.cigar.back().second -= to_clip;
428     }
429 }
430 
431 void
sort_by_read_name()432 CLocAlnSet::sort_by_read_name()
433 {
434     sort(this->reads_.begin(), this->reads_.end(),
435         [] (const SAlnRead& r1, const SAlnRead& r2) { return r1.name < r2.name; }
436         );
437 
438     for (auto& s : reads_per_sample_)
439         s.clear();
440     for (size_t i=0; i<reads_.size(); ++i)
441         reads_per_sample_[reads_[i].sample].push_back(i);
442 }
443 
444 void
sort_by_alignment_offset()445 CLocAlnSet::sort_by_alignment_offset()
446 {
447     auto aln_offset = [] (const Cigar& c) ->size_t {
448         assert(!c.empty());
449         assert(cigar_is_MDI(c));
450         if (c[0].first == 'D') {
451             return c[0].second;
452         } else if (c[0].first == 'I'
453             && c.size() >= 2
454             && c[1].first == 'D')
455         {
456             return c[1].second;
457         } else {
458             return 0;
459         }
460     };
461 
462     sort(this->reads_.begin(), this->reads_.end(),
463         [&aln_offset] (const SAlnRead& r1, const SAlnRead& r2) {
464             return aln_offset(r1.cigar) < aln_offset(r2.cigar);
465         });
466 
467     for (auto& s : reads_per_sample_)
468         s.clear();
469     for (size_t i=0; i<reads_.size(); ++i)
470         reads_per_sample_[reads_[i].sample].push_back(i);
471 }
472 
473 void
merge_paired_reads()474 CLocAlnSet::merge_paired_reads()
475 {
476     //
477     // Sort reads by name. Paired reads should have the same name but end with
478     // respectively "/1" and "/2".
479     //
480     sort_by_read_name();
481 
482     // Merge paired reads.
483     for (auto r1 = this->reads_.begin(); r1 != this->reads_.end(); ++r1) {
484         auto  r2 = r1;
485         ++r2;
486 
487         if (r2 == this->reads_.end())
488             break;
489 
490         const string& n1 = r1->name;
491         const string& n2 = r2->name;
492         const size_t   l = n1.length();
493 
494         if (n2.length() == l && l >= 2 &&
495             n1[l-2] == '/' && n1[l-1] == '1' &&
496             n2[l-2] == '/' && n2[l-1] == '2' &&
497             n1.compare(0, l-2, n2, 0, l-2) == 0) {
498 
499             // r1 and r2 are paired, merge them.
500             //assert(r1->sample == r2->sample); // xxx Fix process_radtags.
501             if (r1->sample == r2->sample)
502                 *r1 = SAlnRead(AlnRead::merger_of(move(*r1), move(*r2)), r1->sample);
503 
504             assert(cigar_length_ref(r1->cigar) == ref_.length());
505 
506             // Mark r2 for removal and skip it.
507             r2->seq.clear();
508             ++r1;
509             if (r1 == this->reads_.end())
510                 break;
511         }
512     }
513 
514     // Remove emptied reads.
515     reads_.erase(std::remove_if(reads_.begin(),
516                                 reads_.end(),
517                                 [](const Read& r) { return r.seq.empty(); }
518                                 ),
519                  reads_.end());
520 
521     // Refresh `reads_per_sample_`.
522     reads_per_sample_ = vector<vector<size_t>>(mpopi().samples().size());
523     for (size_t i=0; i<reads_.size(); ++i)
524         reads_per_sample_[reads_[i].sample].push_back(i);
525 }
526 
operator <<(ostream & os,const CLocAlnSet & loc)527 ostream& operator<< (ostream& os, const CLocAlnSet& loc) {
528     os << "ref\t.\t" << loc.ref().str();
529     for (auto& r : loc.reads_)
530         os << "\n" << r.name << "\t" << loc.mpopi().samples()[r.sample].name << "\t" << r.aln();
531     return os;
532 }
533 
534 void
remove_unmerged_reads(ostream * log)535 CLocAlnSet::remove_unmerged_reads(ostream* log)
536 {
537     if (log != NULL)
538         *log << "BEGIN unpaired_reads\n";
539 
540     for (SAlnRead& r : reads_) {
541         if (r.name.back() != 'm') {
542             r.seq.clear();
543             if (log != NULL)
544                 *log << "rm_unpaired\t" << r.name << '\n';
545         }
546     }
547 
548     // Remove emptied reads.
549     stacks_erase_if(reads_, [](const Read& r){return r.seq.empty();} );
550 
551     // Refresh `reads_per_sample_`.
552     reads_per_sample_ = vector<vector<size_t>>(mpopi().samples().size());
553     for (size_t i=0; i<reads_.size(); ++i)
554         reads_per_sample_[reads_[i].sample].push_back(i);
555 
556     if (log != NULL)
557         *log << "END unpaired_reads\n";
558 }
559 
560 void
remove_pcr_duplicates(vector<size_t> * clone_size_distrib,ostream * log)561 CLocAlnSet::remove_pcr_duplicates(vector<size_t>* clone_size_distrib, ostream* log)
562 {
563     if (log != NULL)
564         *log << "BEGIN pcr_duplicates\n";
565 
566     //
567     // Sort reads by (I) sample; (II) insert size; (III, for stability) name.
568     //
569     for (SAlnRead& r : this->reads_) {
570         assert(!r.cigar.empty());
571         assert(cigar_is_MDI(r.cigar));
572         // assert(r.cigar.front().first == 'M'); // This is the cutsite. //FIXME:
573         cigar_canonicalize_MDI_order(r.cigar);
574     }
575     auto compute_insert_length = [] (const Cigar& c) -> size_t {
576         // rem. We assume the asserts above.
577         size_t len = cigar_length_ref(c);
578         auto last = c.rbegin();
579         if (last->first == 'I') {
580             len += last->second;
581             ++last;
582         }
583         if (last != c.rend() && last->first == 'D')
584             len -= last->second;
585         return len;
586     };
587     sort(this->reads_.begin(), this->reads_.end(),
588         [&compute_insert_length](const SAlnRead& r1, const SAlnRead& r2) {
589             if (r1.sample != r2.sample)
590                 return r1.sample < r2.sample;
591             size_t i1 = compute_insert_length(r1.cigar);
592             size_t i2 = compute_insert_length(r2.cigar);
593             if (i1 != i2)
594                 return i1 < i2;
595             return r1.name.compare(r2.name) < 0;
596         });
597 
598     // Remove reads that have the same insert length.
599     vector<SAlnRead>::iterator r = this->reads_.begin();
600     assert(r != this->reads_.end());
601     size_t len = compute_insert_length(r->cigar);
602     while (r != this->reads_.end()) {
603         auto group = r;
604         size_t group_len = len;
605         ++r;
606         while (r != this->reads_.end()) {
607             len = compute_insert_length(r->cigar);
608             if (r->sample != group->sample || len != group_len)
609                 break;
610             ++r;
611         }
612         size_t clone_size = r - group;
613         if (clone_size > 1)
614             for (auto r2=group+1; r2!=r; ++r2)
615                 r2->seq.clear();
616         if (clone_size_distrib != NULL) {
617             if(clone_size_distrib->size() < clone_size + 1) {
618                 clone_size_distrib->resize(clone_size + 1);
619             }
620             ++clone_size_distrib->at(clone_size);
621         }
622         if (log != NULL) {
623             *log << "pcr_dupls\t" << mpopi().samples()[group->sample].name
624                     << '\t';
625             for (auto r2=group; r2!=r; ++r2)
626                 *log << ',' << r2->name;;
627             *log << '\t' << group_len << '\n';
628         }
629     }
630 
631     // Remove emptied reads.
632     stacks_erase_if(reads_, [](const Read& r){return r.seq.empty();} );
633 
634     // Refresh `reads_per_sample_`.
635     reads_per_sample_ = vector<vector<size_t>>(mpopi().samples().size());
636     for (size_t i=0; i<reads_.size(); ++i)
637         reads_per_sample_[reads_[i].sample].push_back(i);
638 
639     if (log != NULL)
640         *log << "END pcr_duplicates\n";
641 }
642 
643 CLocAlnSet
juxtapose(CLocAlnSet && left,CLocAlnSet && right,long offset)644 CLocAlnSet::juxtapose(CLocAlnSet&& left, CLocAlnSet&& right, long offset)
645 {
646     assert(left.id() == right.id());
647     assert(left.pos() == right.pos());
648     assert(&left.mpopi() == &right.mpopi());
649 
650     size_t left_ref_len = left.ref().length();
651     CLocAlnSet merged (move(left));
652 
653     //
654     // N.B. Oct. 2017:
655     //
656     // It actually possible that
657     // ``` size_t(-offset) > left.ref().length() ```
658     // happens legitimately: if inserts smaller than the read size have been
659     // sequenced, the paired-end contig may end upstream of the forward reads,
660     // in the barcode/adapter region. In this case, we don't do anything special,
661     // as with the current approach the left of the paired-end contig (the
662     // `right` matrix) is trimmed anyway.
663     //
664     // Similarly, it is possible that
665     // ``` size_t(-offset) > right.ref().length() ```
666     // happens legitimately, for the same reason. In this case, we just move the
667     // paired-end reads to the FW object, soft-clipping them entirely.
668     //
669     // At the moment, we don't do any special trimming of the small-insert reads.
670     //
671     if (offset < 0 && size_t(-offset) > right.ref().length()) {
672         for (SAlnRead& r : right.reads_) {
673             r.cigar.assign({{'D', merged.ref().length()}, {'I', r.seq.length()}});
674             merged.add(move(r));
675         }
676         left.clear();
677         right.clear();
678         return merged;
679     }
680 
681     // Extend the reference sequence.
682     if (offset >= 0) {
683         for (long i=0; i<offset; ++i)
684             merged.ref_.push_back(Nt4::n);
685         merged.ref_.append(right.ref().begin(), right.ref().end());
686     } else {
687         auto right_itr = right.ref().begin();
688         for (long i=0; i<-offset; ++i) {
689             assert(right_itr != right.ref().end());
690             ++right_itr;
691         }
692         merged.ref_.append(right_itr, right.ref().end());
693     }
694 
695     // Extend the left reads.
696     for (SAlnRead& r : merged.reads_) {
697         cigar_extend_right(r.cigar, right.ref().length() + offset);
698         assert(cigar_length_ref(r.cigar) == merged.ref().length());
699     }
700 
701     // Extend/Trim & add the right reads.
702     for (SAlnRead& r : right.reads_) {
703         if (offset >= 0) {
704            cigar_extend_left(r.cigar, left_ref_len + offset);
705         } else {
706             cigar_trim_left(r.cigar, -offset);
707             assert(cigar_length_query(r.cigar) == r.seq.length());
708             cigar_extend_left(r.cigar, left_ref_len);
709         }
710         merged.add(move(r));
711     }
712     right.reads_ = vector<SAlnRead>();
713     right.reads_per_sample_ = vector<vector<size_t>>(right.mpopi().samples().size());
714 
715     left.clear();
716     right.clear();
717     return merged;
718 }
719