1 #ifndef ABYSS_ROLLING_HASH_H 2 #define ABYSS_ROLLING_HASH_H 1 3 4 #include "config.h" 5 6 #include "BloomDBG/LightweightKmer.h" 7 #include "BloomDBG/MaskedKmer.h" 8 #include "Common/Sense.h" 9 #include "vendor/nthash/nthash.hpp" 10 11 #include <algorithm> 12 #include <string> 13 #include <vector> 14 #include <cassert> 15 #include <boost/dynamic_bitset.hpp> 16 #include <cstring> 17 18 class RollingHash 19 { 20 private: 21 22 typedef uint64_t hash_t; 23 24 /** 25 * Determine the canonical hash value, given hash values for 26 * forward and reverse-complement of the same k-mer. 27 */ canonicalHash(hash_t hash,hash_t rcHash)28 hash_t canonicalHash(hash_t hash, hash_t rcHash) const 29 { 30 return (rcHash < hash) ? rcHash : hash; 31 } 32 33 public: 34 35 /** 36 * Default constructor. 37 */ RollingHash()38 RollingHash() : m_numHashes(0), m_k(0), m_hash1(0), m_rcHash1(0) {} 39 40 /** 41 * Constructor. Construct RollingHash object when initial k-mer 42 * is unknown. 43 * @param numHashes number of pseudo-independent hash values to compute 44 * for each k-mer 45 * @param k k-mer length 46 */ RollingHash(unsigned numHashes,unsigned k)47 RollingHash(unsigned numHashes, unsigned k) : m_numHashes(numHashes), 48 m_k(k), m_hash1(0), m_rcHash1(0) {} 49 50 /** 51 * Constructor. Construct RollingHash object while specifying 52 * initial k-mer to be hashed. 53 * @param kmer initial k-mer for initializing hash value(s) 54 * @param numHashes number of pseudo-independent hash values to compute 55 * for each k-mer 56 * @param k k-mer length 57 */ RollingHash(const std::string & kmer,unsigned numHashes,unsigned k)58 RollingHash(const std::string& kmer, unsigned numHashes, unsigned k) 59 : m_numHashes(numHashes), m_k(k), m_hash1(0), m_rcHash1(0) 60 { 61 /* init rolling hash state */ 62 reset(kmer); 63 } 64 65 /** 66 * Initialize hash state from sequence. 67 * @param kmer k-mer used to initialize hash state 68 */ reset(const std::string & kmer)69 void reset(const std::string& kmer) 70 { 71 /* compute initial hash values for forward and reverse-complement k-mer */ 72 NTC64(kmer.c_str(), m_k, m_hash1, m_rcHash1); 73 74 /* get canonical hash value from forward/reverse hash values */ 75 m_hash = canonicalHash(m_hash1, m_rcHash1); 76 77 if (!MaskedKmer::mask().empty()) 78 m_hash = maskHash(m_hash1, m_rcHash1, MaskedKmer::mask().c_str(), 79 kmer.c_str(), m_k); 80 } 81 82 /** 83 * Compute hash values for next k-mer to the right and 84 * update internal state. 85 * @param kmer current k-mer 86 * @param nextKmer k-mer we are rolling into 87 */ rollRight(const char * kmer,char charIn)88 void rollRight(const char* kmer, char charIn) 89 { 90 NTC64(kmer[0], charIn, m_k, m_hash1, m_rcHash1); 91 m_hash = canonicalHash(m_hash1, m_rcHash1); 92 93 if (!MaskedKmer::mask().empty()) { 94 // TODO: copying the k-mer and shifting is very inefficient; 95 // we need a specialized nthash function that rolls and masks 96 // simultaneously 97 LightweightKmer next(kmer); 98 next.shift(SENSE, charIn); 99 m_hash = maskHash(m_hash1, m_rcHash1, MaskedKmer::mask().c_str(), 100 next.c_str(), m_k); 101 } 102 } 103 104 /** 105 * Compute hash values for next k-mer to the left and 106 * update internal state. 107 * @param prevKmer k-mer we are rolling into 108 * @param kmer current k-mer 109 */ rollLeft(char charIn,const char * kmer)110 void rollLeft(char charIn, const char* kmer) 111 { 112 NTC64L(kmer[m_k-1], charIn, m_k, m_hash1, m_rcHash1); 113 m_hash = canonicalHash(m_hash1, m_rcHash1); 114 115 if (!MaskedKmer::mask().empty()) { 116 // TODO: copying the k-mer and shifting is very inefficient; 117 // we need a specialized nthash function that rolls and masks 118 // simultaneously 119 LightweightKmer next(kmer); 120 next.shift(ANTISENSE, charIn); 121 m_hash = maskHash(m_hash1, m_rcHash1, MaskedKmer::mask().c_str(), 122 next.c_str(), m_k); 123 } 124 } 125 126 /** 127 * Get the seed hash value for the current k-mer. The seed hash 128 * value is used to calculate multiple pseudo-independant 129 * hash functions. 130 */ getHashSeed()131 size_t getHashSeed() const 132 { 133 return (size_t)m_hash; 134 } 135 136 /** 137 * Get hash values for current k-mer. 138 * 139 * @param hashes array for returned hash values 140 */ getHashes(hash_t hashes[])141 void getHashes(hash_t hashes[]) const 142 { 143 hashes[0] = m_hash; 144 for (unsigned i = 1; i < m_numHashes; ++i) 145 hashes[i] = NTE64(m_hash, m_k, i); 146 } 147 148 /** Equality operator */ 149 bool operator==(const RollingHash& o) const 150 { 151 /** 152 * Note: If hash seeds are equal, then the values 153 * for all hash functions will also be equal, since 154 * the hash values are calculated from the 155 * seed in a deterministic manner. In practice seed 156 * collision is very unlikely, though! 157 */ 158 return m_k == o.m_k && getHashSeed() == o.getHashSeed(); 159 } 160 161 /** Inequality operator */ 162 bool operator!=(const RollingHash& o) const 163 { 164 return !(*this == o); 165 } 166 167 /** 168 * Change the hash value to reflect a change in the first/last base of 169 * the k-mer. 170 * @param kmer point to the k-mer char array 171 * @param dir if SENSE, change last base; if ANTISENSE, 172 * change first base 173 * @param base new value for the base 174 */ setLastBase(char * kmer,extDirection dir,char base)175 void setLastBase(char* kmer, extDirection dir, char base) 176 { 177 if (dir == SENSE) { 178 /* roll left to remove old last char */ 179 NTC64L(kmer[m_k-1], 'A', m_k, m_hash1, m_rcHash1); 180 /* roll right to add new last char */ 181 NTC64('A', base, m_k, m_hash1, m_rcHash1); 182 } else { 183 /* roll right to remove old first char */ 184 NTC64(kmer[0], 'A', m_k, m_hash1, m_rcHash1); 185 /* roll left to add new first char */ 186 NTC64L('A', base, m_k, m_hash1, m_rcHash1); 187 } 188 m_hash = canonicalHash(m_hash1, m_rcHash1); 189 190 if (!MaskedKmer::mask().empty()) 191 m_hash = maskHash(m_hash1, m_rcHash1, MaskedKmer::mask().c_str(), 192 kmer, m_k); 193 } 194 195 /** 196 * Reverse complement the hash state, so that rolling right becomes 197 * rolling left, and vice versa. This operation is needed 198 * whenever we reverse-complement a k-mer that has an associated 199 * `RollingHash` state, so that subsequent rolling operations will 200 * produce the correct hash value. 201 */ reverseComplement()202 void reverseComplement() 203 { 204 std::swap(m_hash1, m_rcHash1); 205 } 206 207 private: 208 209 /** number of hash functions */ 210 unsigned m_numHashes; 211 /** k-mer length */ 212 unsigned m_k; 213 /** value of first hash function for current k-mer */ 214 hash_t m_hash1; 215 /** value of first hash function for current k-mer, after 216 * reverse-complementing */ 217 hash_t m_rcHash1; 218 /** current canonical hash value */ 219 hash_t m_hash; 220 }; 221 222 #endif 223