1 #ifndef DNASEQ4_H 2 #define DNASEQ4_H 3 4 #include "constants.h" 5 #include "nucleotides.h" 6 7 // A sequence of nucleotides on a uint64_t, where the first nucleotide uses the 8 // low bits. 9 template<class Nt> 10 class NtArray { 11 uint64_t a_; 12 13 public: NtArray()14 NtArray() : a_(0) {} NtArray(uint64_t x)15 NtArray(uint64_t x) : a_(x) {} NtArray(const NtArray<Nt> & other)16 NtArray(const NtArray<Nt>& other) : a_(other.a_) {} 17 NtArray<Nt>& operator= (const NtArray<Nt>& other) {a_ = other.a_; return *this;} 18 set(size_t i,Nt nt)19 void set(size_t i, Nt nt) {a_ |= uint64_t(size_t(nt)) << (i*Nt::nbits);} clear(size_t i)20 void clear(size_t i) {a_ &= ~(lowbits << (i*Nt::nbits));} 21 22 Nt operator[] (size_t i) const {return Nt(size_t((a_ >> (i*Nt::nbits)) & lowbits));} 23 bool operator== (const NtArray<Nt>& other) const {return a_ == other.a_;} 24 bool operator< (const NtArray<Nt>& other) const {return a_ < other.a_;} 25 friend struct std::hash<NtArray>; 26 27 // Methods for kmers 28 void push_front(Nt nt) {a_ <<= Nt::nbits; a_ |= uint64_t(size_t(nt));} 29 void pop_front() {a_ >>= Nt::nbits;} 30 31 static const size_t n_nts = sizeof a_ * 8 / Nt::nbits; 32 33 private: 34 static const uint64_t lowbits = (1 << Nt::nbits) - 1; 35 }; 36 37 // A dinucleotide coded on one byte, where the first nucleotide uses the high bits. 38 // This is compatible with BAM/HTSLIB. 39 class DiNuc { 40 uchar x_; 41 42 public: 43 DiNuc() : x_(0) {} 44 DiNuc(const DiNuc& other) : x_(other.x_) {} 45 DiNuc(uchar x) : x_(x) {} 46 DiNuc(Nt4 nt1, Nt4 nt2) : x_(0) {set_first(nt1); set_second(nt2);} 47 DiNuc(char nt1, char nt2) : DiNuc(Nt4(nt1), Nt4(nt2)) {} 48 DiNuc& operator= (const DiNuc& other) {x_ = other.x_; return *this;} 49 50 Nt4 first() const {return Nt4(size_t(x_ >>4));} 51 Nt4 second() const {return Nt4(size_t(x_ & 15));} 52 void first(Nt4 nt) {clear_first(); set_first(nt);} 53 void second(Nt4 nt) {clear_second(); set_second(nt);} 54 55 bool operator== (const DiNuc& other) const {return x_ == other.x_;} 56 bool operator< (const DiNuc& other) const {return x_ < other.x_;} 57 58 private: 59 void set_first(Nt4 nt) {x_ |= uchar(size_t(nt)) <<4;} 60 void set_second(Nt4 nt) {x_ |= uchar(size_t(nt));} 61 62 void clear_first() {x_ &= 15;} 63 void clear_second() {x_ &= ~15;} 64 65 public: 66 uchar x() const {return x_;} // c.f. hash<DNASeq4> 67 }; 68 69 // A sequence of 4-bits A, C, G, T and N. 70 // Uses DiNuc and is thus compatible with BAM/HTSLIB. 71 class DNASeq4 { 72 size_t l_; 73 vector<DiNuc> v_; 74 75 public: 76 class iterator; 77 78 DNASeq4() : l_(0), v_() {} 79 explicit DNASeq4(const DNASeq4& other) : l_(other.l_), v_(other.v_) {} 80 DNASeq4(DNASeq4&& other) noexcept : l_(other.l_), v_(move(other.v_)) {other.clear();} 81 DNASeq4(const char* s, size_t len); 82 DNASeq4(const uchar* arr, size_t len) : l_(len), v_(arr, arr+len/2+len%2) {} // `len` is the length of the sequence (not of the array) 83 DNASeq4(const string& s) : DNASeq4(s.c_str(), s.size()) {} 84 DNASeq4(size_t len) : l_(len), v_(l_/2+l_%2, DiNuc(Nt4::n, Nt4::n)) {if(l_%2) v_.back().second(Nt4::$);} 85 DNASeq4& operator= (const DNASeq4& other) {l_ = other.l_; v_ = other.v_; return *this;} 86 DNASeq4& operator= (DNASeq4&& other) {l_ = other.l_; v_ = move(other.v_); other.clear(); return *this;} 87 88 size_t length() const {return l_;} 89 bool empty() const {return l_ == 0;} 90 void set(size_t i, Nt4 nt) {i%2==0 ? v_[i/2].first(nt) : v_[i/2].second(nt);} 91 void clear() {l_ = 0; v_.clear();} 92 void resize(size_t len); 93 void reserve(size_t len) {v_.reserve(len/2+len%2);} 94 void push_back(Nt4 nt) {++l_; if (l_%2) v_.push_back(DiNuc(nt,Nt4::$)); else v_.back().second(nt);} 95 void append(iterator first, iterator past); 96 97 DNASeq4 rev_compl() const; 98 string str() const; 99 100 void remove_Ns(); 101 void shift_Ns_towards_the_end(); 102 103 Nt4 operator[] (size_t i) const {return i%2==0 ? v_[i/2].first() : v_[i/2].second();} 104 bool operator== (const DNASeq4& other) const {return l_ == other.l_ && v_ == other.v_;} 105 bool operator< (const DNASeq4& other) const {return l_ < other.l_ ? true : v_ < other.v_;} 106 friend struct std::hash<DNASeq4>; 107 friend ostream& operator<< (ostream& os, const DNASeq4& seq) {for (Nt4 nt : seq) os << nt; return os;} 108 109 // Iterator. 110 class iterator { 111 vector<DiNuc>::const_iterator vi_; 112 bool first_; 113 114 public: 115 iterator(vector<DiNuc>::const_iterator vi, bool f) : vi_(vi), first_(f) {} 116 bool operator!= (iterator other) const {return ! (vi_ == other.vi_? first_ == other.first_ : false);} 117 bool operator== (iterator other) const {return !operator!=(other);} 118 iterator& operator++ () {if (first_) {first_ = false;} else {++vi_; first_ = true;} return *this; } 119 iterator& operator-- () {if (first_) {--vi_; first_ = false;} else {first_ = true;} return *this; } 120 size_t operator- (iterator other) {return 2*(vi_ - other.vi_) + size_t(other.first_) - size_t(first_);} 121 122 // Get the (Nt4) nucleotide. 123 Nt4 operator* () const {return first_ ? vi_->first() : vi_->second();} 124 }; 125 iterator begin() const {return iterator(v_.begin(), true);} 126 iterator end() const {return length()%2==0 ? iterator(v_.end(), true) : iterator(--v_.end(), false);} 127 128 // Methods to allow to memcpy into a htslib bam1_t. 129 size_t nbytes() const {return v_.size() * sizeof (DiNuc);} 130 const uchar* vdata() const {return (uchar*) v_.data();} 131 }; 132 133 // 134 // Inline definitions. 135 // ========== 136 // 137 138 namespace std { template<class Nt> 139 struct hash<NtArray<Nt>> { size_t operator() (const NtArray<Nt>& a) const { 140 return hash<uint64_t>()(a.a_); 141 }};} 142 143 namespace std { 144 template<> 145 struct hash<DNASeq4> { 146 size_t operator() (const DNASeq4& s) const { 147 size_t x = static_cast<size_t>(14695981039346656037ULL); 148 for (const DiNuc& d : s.v_) { 149 x ^= static_cast<size_t>(d.x()); 150 x *= static_cast<size_t>(1099511628211ULL); 151 } 152 return x; 153 } 154 }; 155 } 156 157 #endif //DNASEQ4_h 158