1 #ifndef ABYSS_ROLLING_HASH_H
2 #define ABYSS_ROLLING_HASH_H 1
3 
4 #include "config.h"
5 
6 #include "BloomDBG/LightweightKmer.h"
7 #include "BloomDBG/MaskedKmer.h"
8 #include "Common/Sense.h"
9 #include "vendor/nthash/nthash.hpp"
10 
11 #include <algorithm>
12 #include <string>
13 #include <vector>
14 #include <cassert>
15 #include <boost/dynamic_bitset.hpp>
16 #include <cstring>
17 
18 class RollingHash
19 {
20 private:
21 
22 	typedef uint64_t hash_t;
23 
24 	/**
25 	 * Determine the canonical hash value, given hash values for
26 	 * forward and reverse-complement of the same k-mer.
27 	 */
canonicalHash(hash_t hash,hash_t rcHash)28 	hash_t canonicalHash(hash_t hash, hash_t rcHash) const
29 	{
30 		return (rcHash < hash) ? rcHash : hash;
31 	}
32 
33 public:
34 
35 	/**
36 	 * Default constructor.
37 	 */
RollingHash()38 	RollingHash() : m_numHashes(0), m_k(0), m_hash1(0), m_rcHash1(0) {}
39 
40 	/**
41 	 * Constructor. Construct RollingHash object when initial k-mer
42 	 * is unknown.
43 	 * @param numHashes number of pseudo-independent hash values to compute
44 	 * for each k-mer
45 	 * @param k k-mer length
46 	 */
RollingHash(unsigned numHashes,unsigned k)47 	RollingHash(unsigned numHashes, unsigned k) : m_numHashes(numHashes),
48 		m_k(k), m_hash1(0), m_rcHash1(0) {}
49 
50 	/**
51 	 * Constructor. Construct RollingHash object while specifying
52 	 * initial k-mer to be hashed.
53 	 * @param kmer initial k-mer for initializing hash value(s)
54 	 * @param numHashes number of pseudo-independent hash values to compute
55 	 * for each k-mer
56 	 * @param k k-mer length
57 	 */
RollingHash(const std::string & kmer,unsigned numHashes,unsigned k)58 	RollingHash(const std::string& kmer, unsigned numHashes, unsigned k)
59 		: m_numHashes(numHashes), m_k(k), m_hash1(0), m_rcHash1(0)
60 	{
61 		/* init rolling hash state */
62 		reset(kmer);
63 	}
64 
65 	/**
66 	 * Initialize hash state from sequence.
67 	 * @param kmer k-mer used to initialize hash state
68 	 */
reset(const std::string & kmer)69 	void reset(const std::string& kmer)
70 	{
71 		/* compute initial hash values for forward and reverse-complement k-mer */
72 		NTC64(kmer.c_str(), m_k, m_hash1, m_rcHash1);
73 
74 		/* get canonical hash value from forward/reverse hash values */
75 		m_hash = canonicalHash(m_hash1, m_rcHash1);
76 
77 		if (!MaskedKmer::mask().empty())
78 			m_hash = maskHash(m_hash1, m_rcHash1, MaskedKmer::mask().c_str(),
79 				kmer.c_str(), m_k);
80 	}
81 
82 	/**
83 	 * Compute hash values for next k-mer to the right and
84 	 * update internal state.
85 	 * @param kmer current k-mer
86 	 * @param nextKmer k-mer we are rolling into
87 	 */
rollRight(const char * kmer,char charIn)88 	void rollRight(const char* kmer, char charIn)
89 	{
90 		NTC64(kmer[0], charIn, m_k, m_hash1, m_rcHash1);
91 		m_hash = canonicalHash(m_hash1, m_rcHash1);
92 
93 		if (!MaskedKmer::mask().empty()) {
94 			// TODO: copying the k-mer and shifting is very inefficient;
95 			// we need a specialized nthash function that rolls and masks
96 			// simultaneously
97 			LightweightKmer next(kmer);
98 			next.shift(SENSE, charIn);
99 			m_hash = maskHash(m_hash1, m_rcHash1, MaskedKmer::mask().c_str(),
100 				next.c_str(), m_k);
101 		}
102 	}
103 
104 	/**
105 	 * Compute hash values for next k-mer to the left and
106 	 * update internal state.
107 	 * @param prevKmer k-mer we are rolling into
108 	 * @param kmer current k-mer
109 	 */
rollLeft(char charIn,const char * kmer)110 	void rollLeft(char charIn, const char* kmer)
111 	{
112 		NTC64L(kmer[m_k-1], charIn, m_k, m_hash1, m_rcHash1);
113 		m_hash = canonicalHash(m_hash1, m_rcHash1);
114 
115 		if (!MaskedKmer::mask().empty()) {
116 			// TODO: copying the k-mer and shifting is very inefficient;
117 			// we need a specialized nthash function that rolls and masks
118 			// simultaneously
119 			LightweightKmer next(kmer);
120 			next.shift(ANTISENSE, charIn);
121 			m_hash = maskHash(m_hash1, m_rcHash1, MaskedKmer::mask().c_str(),
122 				next.c_str(), m_k);
123 		}
124 	}
125 
126 	/**
127 	 * Get the seed hash value for the current k-mer. The seed hash
128 	 * value is used to calculate multiple pseudo-independant
129 	 * hash functions.
130 	 */
getHashSeed()131 	size_t getHashSeed() const
132 	{
133 		return (size_t)m_hash;
134 	}
135 
136 	/**
137 	 * Get hash values for current k-mer.
138 	 *
139 	 * @param hashes array for returned hash values
140 	 */
getHashes(hash_t hashes[])141 	void getHashes(hash_t hashes[]) const
142 	{
143 		hashes[0] = m_hash;
144 		for (unsigned i = 1; i < m_numHashes; ++i)
145 			hashes[i] = NTE64(m_hash, m_k, i);
146 	}
147 
148 	/** Equality operator */
149 	bool operator==(const RollingHash& o) const
150 	{
151 		/**
152 		 * Note: If hash seeds are equal, then the values
153 		 * for all hash functions will also be equal, since
154 		 * the hash values are calculated from the
155 		 * seed in a deterministic manner. In practice seed
156 		 * collision is very unlikely, though!
157 		 */
158 		return m_k == o.m_k && getHashSeed() == o.getHashSeed();
159 	}
160 
161 	/** Inequality operator */
162 	bool operator!=(const RollingHash& o) const
163 	{
164 		return !(*this == o);
165 	}
166 
167 	/**
168 	 * Change the hash value to reflect a change in the first/last base of
169 	 * the k-mer.
170 	 * @param kmer point to the k-mer char array
171 	 * @param dir if SENSE, change last base; if ANTISENSE,
172 	 * change first base
173 	 * @param base new value for the base
174 	 */
setLastBase(char * kmer,extDirection dir,char base)175 	void setLastBase(char* kmer, extDirection dir, char base)
176 	{
177 		if (dir == SENSE) {
178 			/* roll left to remove old last char */
179 			NTC64L(kmer[m_k-1], 'A', m_k, m_hash1, m_rcHash1);
180 			/* roll right to add new last char */
181 			NTC64('A', base, m_k, m_hash1, m_rcHash1);
182 		} else {
183 			/* roll right to remove old first char */
184 			NTC64(kmer[0], 'A', m_k, m_hash1, m_rcHash1);
185 			/* roll left to add new first char */
186 			NTC64L('A', base, m_k, m_hash1, m_rcHash1);
187 		}
188 		m_hash = canonicalHash(m_hash1, m_rcHash1);
189 
190 		if (!MaskedKmer::mask().empty())
191 			m_hash = maskHash(m_hash1, m_rcHash1, MaskedKmer::mask().c_str(),
192 				kmer, m_k);
193 	}
194 
195 	/**
196 	 * Reverse complement the hash state, so that rolling right becomes
197 	 * rolling left, and vice versa.  This operation is needed
198 	 * whenever we reverse-complement a k-mer that has an associated
199 	 * `RollingHash` state, so that subsequent rolling operations will
200 	 * produce the correct hash value.
201 	 */
reverseComplement()202 	void reverseComplement()
203 	{
204 		std::swap(m_hash1, m_rcHash1);
205 	}
206 
207 private:
208 
209 	/** number of hash functions */
210 	unsigned m_numHashes;
211 	/** k-mer length */
212 	unsigned m_k;
213 	/** value of first hash function for current k-mer */
214 	hash_t m_hash1;
215 	/** value of first hash function for current k-mer, after
216 	 * reverse-complementing */
217 	hash_t m_rcHash1;
218 	/** current canonical hash value */
219 	hash_t m_hash;
220 };
221 
222 #endif
223