1 #ifndef DNASEQ4_H
2 #define DNASEQ4_H
3 
4 #include "constants.h"
5 #include "nucleotides.h"
6 
7 // A sequence of nucleotides on a uint64_t, where the first nucleotide uses the
8 // low bits.
9 template<class Nt>
10 class NtArray {
11     uint64_t a_;
12 
13 public:
NtArray()14     NtArray() : a_(0) {}
NtArray(uint64_t x)15     NtArray(uint64_t x) : a_(x) {}
NtArray(const NtArray<Nt> & other)16     NtArray(const NtArray<Nt>& other) : a_(other.a_) {}
17     NtArray<Nt>& operator= (const NtArray<Nt>& other) {a_ = other.a_; return *this;}
18 
set(size_t i,Nt nt)19     void set(size_t i, Nt nt) {a_ |= uint64_t(size_t(nt)) << (i*Nt::nbits);}
clear(size_t i)20     void clear(size_t i) {a_ &= ~(lowbits << (i*Nt::nbits));}
21 
22     Nt operator[] (size_t i) const {return Nt(size_t((a_ >> (i*Nt::nbits)) & lowbits));}
23     bool operator== (const NtArray<Nt>& other) const {return a_ == other.a_;}
24     bool operator<  (const NtArray<Nt>& other) const {return a_ < other.a_;}
25     friend struct std::hash<NtArray>;
26 
27     // Methods for kmers
28     void push_front(Nt nt) {a_ <<= Nt::nbits; a_ |= uint64_t(size_t(nt));}
29     void pop_front() {a_ >>= Nt::nbits;}
30 
31     static const size_t n_nts = sizeof a_ * 8 / Nt::nbits;
32 
33 private:
34     static const uint64_t lowbits = (1 << Nt::nbits) - 1;
35 };
36 
37 // A dinucleotide coded on one byte, where the first nucleotide uses the high bits.
38 // This is compatible with BAM/HTSLIB.
39 class DiNuc {
40     uchar x_;
41 
42 public:
43     DiNuc() : x_(0) {}
44     DiNuc(const DiNuc& other) : x_(other.x_) {}
45     DiNuc(uchar x) : x_(x) {}
46     DiNuc(Nt4 nt1, Nt4 nt2) : x_(0) {set_first(nt1); set_second(nt2);}
47     DiNuc(char nt1, char nt2) : DiNuc(Nt4(nt1), Nt4(nt2)) {}
48     DiNuc& operator= (const DiNuc& other) {x_ = other.x_; return *this;}
49 
50     Nt4 first() const {return Nt4(size_t(x_ >>4));}
51     Nt4 second() const {return Nt4(size_t(x_ & 15));}
52     void first(Nt4 nt) {clear_first(); set_first(nt);}
53     void second(Nt4 nt) {clear_second(); set_second(nt);}
54 
55     bool operator== (const DiNuc& other) const {return x_ == other.x_;}
56     bool operator<  (const DiNuc& other) const {return x_ < other.x_;}
57 
58 private:
59     void set_first(Nt4 nt) {x_ |= uchar(size_t(nt)) <<4;}
60     void set_second(Nt4 nt) {x_ |= uchar(size_t(nt));}
61 
62     void clear_first() {x_ &= 15;}
63     void clear_second() {x_ &= ~15;}
64 
65 public:
66     uchar x() const {return x_;} // c.f. hash<DNASeq4>
67 };
68 
69 // A sequence of 4-bits A, C, G, T and N.
70 // Uses DiNuc and is thus compatible with BAM/HTSLIB.
71 class DNASeq4 {
72     size_t l_;
73     vector<DiNuc> v_;
74 
75 public:
76     class iterator;
77 
78     DNASeq4() : l_(0), v_() {}
79     explicit DNASeq4(const DNASeq4& other) : l_(other.l_), v_(other.v_) {}
80     DNASeq4(DNASeq4&& other) noexcept : l_(other.l_), v_(move(other.v_)) {other.clear();}
81     DNASeq4(const char* s, size_t len);
82     DNASeq4(const uchar* arr, size_t len) : l_(len), v_(arr, arr+len/2+len%2) {} // `len` is the length of the sequence (not of the array)
83     DNASeq4(const string& s) : DNASeq4(s.c_str(), s.size()) {}
84     DNASeq4(size_t len) : l_(len), v_(l_/2+l_%2, DiNuc(Nt4::n, Nt4::n)) {if(l_%2) v_.back().second(Nt4::$);}
85     DNASeq4& operator= (const DNASeq4& other) {l_ = other.l_; v_ = other.v_; return *this;}
86     DNASeq4& operator= (DNASeq4&& other) {l_ = other.l_; v_ = move(other.v_); other.clear(); return *this;}
87 
88     size_t length() const {return l_;}
89     bool empty() const {return l_ == 0;}
90     void set(size_t i, Nt4 nt) {i%2==0 ? v_[i/2].first(nt) : v_[i/2].second(nt);}
91     void clear() {l_ = 0; v_.clear();}
92     void resize(size_t len);
93     void reserve(size_t len) {v_.reserve(len/2+len%2);}
94     void push_back(Nt4 nt) {++l_; if (l_%2) v_.push_back(DiNuc(nt,Nt4::$)); else v_.back().second(nt);}
95     void append(iterator first, iterator past);
96 
97     DNASeq4 rev_compl() const;
98     string str() const;
99 
100     void remove_Ns();
101     void shift_Ns_towards_the_end();
102 
103     Nt4 operator[] (size_t i) const {return i%2==0 ? v_[i/2].first() : v_[i/2].second();}
104     bool  operator== (const DNASeq4& other) const {return l_ == other.l_ && v_ == other.v_;}
105     bool  operator<  (const DNASeq4& other) const {return l_ < other.l_ ? true : v_ < other.v_;}
106     friend struct std::hash<DNASeq4>;
107     friend ostream& operator<< (ostream& os, const DNASeq4& seq) {for (Nt4 nt : seq) os << nt; return os;}
108 
109     // Iterator.
110     class iterator {
111         vector<DiNuc>::const_iterator vi_;
112         bool first_;
113 
114     public:
115         iterator(vector<DiNuc>::const_iterator vi, bool f) : vi_(vi), first_(f) {}
116         bool operator!= (iterator other) const {return ! (vi_ == other.vi_? first_ == other.first_ : false);}
117         bool operator== (iterator other) const {return !operator!=(other);}
118         iterator& operator++ () {if (first_) {first_ = false;} else {++vi_; first_ = true;} return *this; }
119         iterator& operator-- () {if (first_) {--vi_; first_ = false;} else {first_ = true;} return *this; }
120         size_t operator- (iterator other) {return 2*(vi_ - other.vi_) + size_t(other.first_) - size_t(first_);}
121 
122         // Get the (Nt4) nucleotide.
123         Nt4 operator* () const {return first_ ? vi_->first() : vi_->second();}
124     };
125     iterator begin() const {return iterator(v_.begin(), true);}
126     iterator end()   const {return length()%2==0 ? iterator(v_.end(), true) : iterator(--v_.end(), false);}
127 
128     // Methods to allow to memcpy into a htslib bam1_t.
129     size_t nbytes() const {return v_.size() * sizeof (DiNuc);}
130     const uchar* vdata() const {return (uchar*) v_.data();}
131 };
132 
133 //
134 // Inline definitions.
135 // ==========
136 //
137 
138 namespace std { template<class Nt>
139 struct hash<NtArray<Nt>> { size_t operator() (const NtArray<Nt>& a) const {
140     return hash<uint64_t>()(a.a_);
141 }};}
142 
143 namespace std {
144 template<>
145 struct hash<DNASeq4> {
146     size_t operator() (const DNASeq4& s) const {
147         size_t x = static_cast<size_t>(14695981039346656037ULL);
148         for (const DiNuc& d : s.v_) {
149             x ^= static_cast<size_t>(d.x());
150             x *= static_cast<size_t>(1099511628211ULL);
151         }
152         return x;
153     }
154 };
155 }
156 
157 #endif //DNASEQ4_h
158