1 #ifndef FMINDEX_H
2 #define FMINDEX_H 1
3 
4 #include "config.h"
5 #include "BitArrays.h"
6 #include "IOUtil.h"
7 #include "sais.hxx"
8 #include <boost/integer.hpp>
9 #include <algorithm>
10 #include <cassert>
11 #include <cstdlib> // for exit
12 #include <iostream>
13 #include <iterator>
14 #include <limits> // for numeric_limits
15 #include <stdint.h>
16 #include <string>
17 #include <vector>
18 
19 /** An FM index. */
20 class FMIndex
21 {
22 	/** A symbol. */
23 	typedef uint8_t T;
24 
25 	/** The sentinel symbol. */
SENTINEL()26 	static T SENTINEL() { return std::numeric_limits<T>::max(); }
27 
28   public:
29 	/** An index. */
30 	typedef boost::uint_t<FMBITS>::least size_type;
31 
32 	/** An index for SAIS, which must be signed. */
33 	typedef boost::int_t<FMBITS>::least sais_size_type;
34 
35 	/** The type of a symbol. */
36 	typedef T value_type;
37 
38 /** A suffix array interval. */
39 struct SAInterval
40 {
41 	size_type l, u;
SAIntervalSAInterval42 	SAInterval(size_type l, size_type u) : l(l), u(u) { }
SAIntervalSAInterval43 	SAInterval(const FMIndex& fm) : l(1), u(fm.m_occ.size()) { }
emptySAInterval44 	bool empty() const { return l >= u; }
45 	bool operator==(const SAInterval& x) const
46 	{
47 		return x.l == l && x.u == u;
48 	}
49 
sizeSAInterval50 	size_type size() const
51 	{
52 		assert(l <= u);
53 		return u - l;
54 	}
55 };
56 
57 /** A match of a substring of a query sequence to an FM index. */
58 struct Match : public SAInterval
59 {
60 	unsigned qstart, qend, num;
61 
MatchMatch62 	Match() : SAInterval(0, 0), qstart(0), qend(0), num(0) { }
63 
64 	Match(size_type l, size_type u, unsigned qstart, unsigned qend,
65 			unsigned num = 1)
SAIntervalMatch66 		: SAInterval(l, u), qstart(qstart), qend(qend), num(num) { }
67 
qspanMatch68 	unsigned qspan() const
69 	{
70 		assert(qstart <= qend);
71 		return qend - qstart;
72 	}
73 };
74 
FMIndex()75 FMIndex() : m_sampleSA(1) { }
76 
77 /** Return the size of the string not counting the sentinel. */
size()78 size_t size() const { return m_occ.size() - 1; }
79 
80 /** The size of the alphabet. */
alphabetSize()81 unsigned alphabetSize() const { return m_alphabet.size(); }
82 
83 /** Encode the alphabet of [first, last). */
84 template <typename It>
encode(It first,It last)85 void encode(It first, It last) const
86 {
87 	assert(first < last);
88 	assert(!m_alphabet.empty());
89 	std::transform(first, last, first, Translate(*this));
90 }
91 
92 /** Decode the alphabet of [first, last). */
93 template <typename It>
decode(It first,It last)94 void decode(It first, It last) const
95 {
96 	assert(first < last);
97 	assert(!m_alphabet.empty());
98 	for (It it = first; it < last; ++it)
99 		*it = m_alphabet[*it];
100 }
101 
102 /** Build the BWT of [first, last).
103  * The BWT including the sentinel is stored in [first, last].
104  * @return the position of the sentinel
105  */
106 template<typename It>
buildBWT(It first,It last)107 size_type buildBWT(It first, It last) const
108 {
109 	assert(first < last);
110 	assert(size_t(last - first)
111 			< std::numeric_limits<size_type>::max());
112 	encode(first, last);
113 	std::replace(first, last, SENTINEL(), T(0));
114 
115 	std::cerr << "Building the Burrows-Wheeler transform...\n";
116 	size_t n = last - first;
117 	std::vector<sais_size_type> sa(n);
118 	assert(sizeof (size_type) == sizeof (sais_size_type));
119 	sais_size_type sentinel = saisxx_bwt(first, first,
120 			&sa[0],
121 			(sais_size_type)n,
122 			(sais_size_type)m_alphabet.size());
123 	assert(sentinel >= 0);
124 	if (sentinel < 0)
125 		abort();
126 	// Insert the sentinel.
127 	std::copy_backward(first + sentinel, last, last + 1);
128 	first[sentinel] = SENTINEL();
129 	return sentinel;
130 }
131 
132 /** Set the specified element of the sampled suffix array. */
setSA(size_t sai,size_t pos)133 void setSA(size_t sai, size_t pos)
134 {
135 	if (sai % m_sampleSA == 0) {
136 		size_t i = sai / m_sampleSA;
137 		assert(i < m_sa.size());
138 		m_sa[i] = pos;
139 	}
140 }
141 
142 /** Construct the suffix array from the FM index. */
constructSuffixArray()143 void constructSuffixArray()
144 {
145 	// The length of the original string.
146 	size_t n = m_occ.size() - 1;
147 	assert(n > 0);
148 	assert(m_sampleSA > 0);
149 	m_sa.resize(n / m_sampleSA + 1);
150 	size_t sai = 0;
151 	for (size_t i = n; i > 0; i--) {
152 		setSA(sai, i);
153 		T c = m_occ.at(sai);
154 		assert(c != SENTINEL());
155 		sai = m_cf[c] + m_occ.rank(c, sai);
156 		assert(sai > 0);
157 	}
158 	setSA(sai, 0);
159 }
160 
161 /** Build an FM-index of the specified BWT. */
162 template<typename It>
assignBWT(It first,It last)163 void assignBWT(It first, It last)
164 {
165 	assert(last - first > 1);
166 	assert(size_t(last - first)
167 			< std::numeric_limits<size_type>::max());
168 
169 	std::cerr << "Building the character occurrence table...\n";
170 	m_occ.assign(first, last);
171 	countOccurrences();
172 
173 	// Construct the suffix array from the FM index.
174 	std::cerr << "Building the suffix array...\n";
175 	constructSuffixArray();
176 }
177 
178 /** Build an FM-index of the specified data. */
179 template<typename It>
assign(It first,It last)180 void assign(It first, It last)
181 {
182 	assert(first < last);
183 	assert(size_t(last - first)
184 			< std::numeric_limits<size_type>::max());
185 
186 	encode(first, last);
187 	std::replace(first, last, SENTINEL(), T(0));
188 
189 	// Construct the suffix array.
190 	std::cerr << "Building the suffix array...\n";
191 	size_t n = last - first;
192 	m_sampleSA = 1;
193 	m_sa.resize(n + 1);
194 	m_sa[0] = n;
195 
196 	assert(sizeof (size_type) == sizeof (sais_size_type));
197 	int status = saisxx(first,
198 			reinterpret_cast<sais_size_type*>(&m_sa[1]),
199 			(sais_size_type)n,
200 			(sais_size_type)m_alphabet.size());
201 	assert(status == 0);
202 	if (status != 0)
203 		abort();
204 
205 	// Construct the Burrows-Wheeler transform.
206 	std::vector<T> bwt;
207 	std::cerr << "Building the Burrows-Wheeler transform...\n";
208 	bwt.resize(m_sa.size());
209 	for (size_t i = 0; i < m_sa.size(); i++)
210 		bwt[i] = m_sa[i] == 0 ? SENTINEL() : first[m_sa[i] - 1];
211 
212 	std::cerr << "Building the character occurrence table...\n";
213 	m_occ.assign(bwt.begin(), bwt.end());
214 	countOccurrences();
215 }
216 
217 /** Sample the suffix array. */
sampleSA(unsigned period)218 void sampleSA(unsigned period)
219 {
220 	assert(period > 0);
221 	if (period == m_sampleSA)
222 		return;
223 	assert(m_sampleSA == 1);
224 	m_sampleSA = period;
225 	if (m_sampleSA == 1 || m_sa.empty())
226 		return;
227 	std::vector<size_type>::iterator out = m_sa.begin();
228 	for (size_t i = 0; i < m_sa.size(); i += m_sampleSA)
229 		*out++ = m_sa[i];
230 	m_sa.erase(out, m_sa.end());
231 	assert(!m_sa.empty());
232 }
233 
234 /** Return the specified element of the suffix array. */
at(size_t i)235 size_t at(size_t i) const
236 {
237 	assert(i < m_occ.size());
238 	size_t n = 0;
239 	while (i % m_sampleSA != 0) {
240 		T c = m_occ.at(i);
241 		i = c == SENTINEL() ? 0 : m_cf[c] + m_occ.rank(c, i);
242 		n++;
243 	}
244 	assert(i / m_sampleSA < m_sa.size());
245 	size_t pos = m_sa[i / m_sampleSA] + n;
246 	return pos < m_occ.size() ? pos : pos - m_occ.size();
247 }
248 
249 /** Return the specified element of the suffix array. */
250 size_t operator[](size_t i) const
251 {
252 	return at(i);
253 }
254 
255 /** Return the symbol at the specifed position of the BWT. */
bwtAt(size_t i)256 T bwtAt(size_t i) const
257 {
258 	assert(i < m_occ.size());
259 	T c = m_occ.at(i);
260 	assert(c != SENTINEL());
261 	assert(c < m_alphabet.size());
262 	return m_alphabet[c];
263 }
264 
265 /** Return the first symbol of the specified suffix. */
symbolAt(size_t i)266 T symbolAt(size_t i) const
267 {
268 	assert(i < m_occ.size());
269 	std::vector<size_type>::const_iterator it
270 		= std::upper_bound(m_cf.begin(), m_cf.end(), i);
271 	assert(it != m_cf.begin());
272 	T c = it - m_cf.begin() - 1;
273 	assert(c < m_alphabet.size());
274 	return m_alphabet[c];
275 }
276 
277 /** Decompress the index. */
278 template <typename It>
decompress(It out)279 void decompress(It out)
280 {
281 	// Construct the original string.
282 	std::vector<T> s;
283 	for (size_t i = 0;;) {
284 		assert(i < m_occ.size());
285 		T c = m_occ.at(i);
286 		if (c == SENTINEL())
287 			break;
288 		s.push_back(c);
289 		i = m_cf[c] + m_occ.rank(c, i);
290 		assert(i > 0);
291 	}
292 
293 	// Translate the character set and output the result.
294 	for (std::vector<T>::reverse_iterator it = s.rbegin();
295 			it != s.rend(); ++it) {
296 		T c = *it;
297 		assert(c < m_alphabet.size());
298 		*out++ = m_alphabet[c];
299 	}
300 }
301 
302 /** Extend a suffix array coordinate by one character to the left. */
update(size_type i,T c)303 size_type update(size_type i, T c) const
304 {
305 	assert(c < m_cf.size());
306 	return m_cf[c] + m_occ.rank(c, i);
307 }
308 
309 /** Extend a suffix array interval by one character to the left. */
update(SAInterval sai,T c)310 SAInterval update(SAInterval sai, T c) const
311 {
312 	return SAInterval(update(sai.l, c), update(sai.u, c));
313 }
314 
315 /** Search for an exact match. */
316 template <typename It>
findExact(It first,It last,SAInterval sai)317 SAInterval findExact(It first, It last, SAInterval sai) const
318 {
319 	assert(first < last);
320 	for (It it = last - 1; it >= first && !sai.empty(); --it) {
321 		T c = *it;
322 		if (c == SENTINEL())
323 			return SAInterval(0, 0);
324 		sai = update(sai, c);
325 	}
326 	return sai;
327 }
328 
329 /** Search for an exact match. */
330 template <typename String>
findExact(const String & q)331 SAInterval findExact(const String& q) const
332 {
333 	String s(q.size());
334 	std::transform(q.begin(), q.end(), s.begin(), Translate(*this));
335 	return findExact(s.begin(), s.end());
336 }
337 
338 /** Search for a suffix of the query that matches a prefix of the
339  * target.
340  */
341 template <typename It, typename OutIt>
findOverlapSuffix(It first,It last,OutIt out,unsigned minOverlap)342 OutIt findOverlapSuffix(It first, It last, OutIt out,
343 		unsigned minOverlap) const
344 {
345 	assert(first < last);
346 
347 	SAInterval sai(*this);
348 	for (It it = last - 1; it >= first && !sai.empty(); --it) {
349 		T c = *it;
350 		if (c == SENTINEL())
351 			break;
352 		sai = update(sai, c);
353 		if (sai.empty())
354 			break;
355 
356 		if (unsigned(last - it) < minOverlap)
357 			continue;
358 
359 		SAInterval sai1 = update(sai, 0);
360 		if (!sai1.empty())
361 			*out++ = Match(sai1.l, sai1.u,
362 					it - first, last - first);
363 	}
364 	return out;
365 }
366 
367 /** Search for a suffix of the query that matches a prefix of the
368  * target.
369  */
370 template <typename OutIt>
findOverlapSuffix(const std::string & q,OutIt out,unsigned minOverlap)371 OutIt findOverlapSuffix(const std::string& q, OutIt out,
372 		unsigned minOverlap) const
373 {
374 	std::string s = q;
375 	std::transform(s.begin(), s.end(), s.begin(), Translate(*this));
376 	return findOverlapSuffix(s.begin(), s.end(), out, minOverlap);
377 }
378 
379 /** Search for a prefix of the query that matches a suffix of the
380  * target.
381  */
382 template <typename OutIt>
findOverlapPrefix(const std::string & q,OutIt out,unsigned minOverlap)383 OutIt findOverlapPrefix(const std::string& q, OutIt out,
384 		unsigned minOverlap) const
385 {
386 	std::string s = q;
387 	std::transform(s.begin(), s.end(), s.begin(), Translate(*this));
388 	typedef std::string::const_iterator It;
389 	It first = s.begin();
390 	for (It it = first + minOverlap; it <= s.end(); ++it) {
391 		SAInterval sai = findExact(first, it,
392 				update(SAInterval(*this), 0));
393 		if (!sai.empty())
394 			*out++ = Match(sai.l, sai.u, 0, it - first);
395 	}
396 	return out;
397 }
398 
399 /** Search for a matching suffix of the query. */
400 template <typename It, typename MemoIt>
findSuffix(It first,It last,MemoIt memoIt)401 Match findSuffix(It first, It last, MemoIt memoIt) const
402 {
403 	assert(first < last);
404 
405 	SAInterval sai(*this);
406 	It it;
407 	for (it = last - 1; it >= first && !sai.empty(); --it) {
408 		T c = *it;
409 		if (c == SENTINEL())
410 			break;
411 		SAInterval sai1 = update(sai, c);
412 		if (sai1.empty())
413 			break;
414 		sai = sai1;
415 
416 		if (*memoIt == sai) {
417 			// This vertex of the prefix DAWG has been visited.
418 			break;
419 		}
420 		*memoIt++ = sai;
421 	}
422 	return Match(sai.l, sai.u, it - first + 1, last - first);
423 }
424 
425 /** Search for a matching substring of the query at least k long.
426  * @return the longest match
427  */
428 template <typename It>
findSubstring(It first,It last,unsigned k)429 Match findSubstring(It first, It last, unsigned k) const
430 {
431 	assert(first < last);
432 	Match best(0, 0, 0, k > 0 ? k - 1 : 0);
433 
434 	// Record which vertices of the prefix DAWG have been visited.
435 	std::vector<SAInterval> memo(last - first, SAInterval(0, 0));
436 	SAInterval* memoIt = &memo[0];
437 	for (It it = last; it > first; --it) {
438 		if (unsigned(it - first) < best.qspan())
439 			return best;
440 		Match interval = findSuffix(first, it, memoIt++);
441 		if (interval.qspan() > best.qspan())
442 			best = interval;
443 		else if (interval.qspan() == best.qspan())
444 			best.num++;
445 	}
446 	return best;
447 }
448 
449 /** Translate from ASCII to the indexed alphabet. */
450 struct Translate {
TranslateTranslate451 	Translate(const FMIndex& fmIndex) : m_fmIndex(fmIndex) { }
operatorTranslate452 	T operator()(unsigned char c) const
453 	{
454 		return c < m_fmIndex.m_mapping.size()
455 			? m_fmIndex.m_mapping[c] : SENTINEL();
456 	}
457   private:
458 	const FMIndex& m_fmIndex;
459 };
460 
461 /** Search for a matching substring of the query at least k long.
462  * @return the longest match and the number of matches
463  */
find(const std::string & q,unsigned k)464 Match find(const std::string& q, unsigned k) const
465 {
466 	std::string s = q;
467 	std::transform(s.begin(), s.end(), s.begin(), Translate(*this));
468 	return findSubstring(s.begin(), s.end(), k);
469 }
470 
471 /** Set the alphabet to [first, last).
472  * The character '\0' is treated specially and not included in the
473  * alphabet.
474  */
475 template <typename It>
setAlphabet(It first,It last)476 void setAlphabet(It first, It last)
477 {
478 	assert(first < last);
479 	std::vector<bool> mask(std::numeric_limits<T>::max());
480 	for (It it = first; it < last; ++it) {
481 		assert((size_t)*it < mask.size());
482 		mask[*it] = true;
483 	}
484 
485 	// Remove the character '\0' from the alphabet.
486 	mask[0] = false;
487 
488 	m_alphabet.clear();
489 	m_mapping.clear();
490 	for (unsigned c = 0; c < mask.size(); ++c) {
491 		if (!mask[c])
492 			continue;
493 		m_mapping.resize(c + 1, std::numeric_limits<T>::max());
494 		m_mapping[c] = m_alphabet.size();
495 		m_alphabet.push_back(c);
496 	}
497 	assert(!m_alphabet.empty());
498 }
499 
500 /** Set the alphabet to the characters of s. */
setAlphabet(const std::string & s)501 void setAlphabet(const std::string& s)
502 {
503 	assert(!s.empty());
504 	setAlphabet(s.begin(), s.end());
505 }
506 
507 #define STRINGIFY(X) #X
508 #define FM_VERSION_BITS(BITS) "FM " STRINGIFY(BITS) " 1"
509 #define FM_VERSION FM_VERSION_BITS(FMBITS)
510 
511 /** Store an index. */
512 friend std::ostream& operator<<(std::ostream& out, const FMIndex& o)
513 {
514 	out << FM_VERSION << '\n'
515 		<< o.m_sampleSA << '\n';
516 
517 	out << o.m_alphabet.size() << '\n';
518 	out.write(reinterpret_cast<const char*>(&o.m_alphabet[0]),
519 			o.m_alphabet.size() * sizeof o.m_alphabet[0]);
520 
521 	out << o.m_sa.size() << '\n';
522 	out.write(reinterpret_cast<const char*>(&o.m_sa[0]),
523 		o.m_sa.size() * sizeof o.m_sa[0]);
524 
525 	return out << o.m_occ;
526 }
527 
528 /** Load an index. */
529 friend std::istream& operator>>(std::istream& in, FMIndex& o)
530 {
531 	std::string version;
532 	std::getline(in, version);
533 	assert(in);
534 	if (version != FM_VERSION) {
535 		std::cerr << "error: the version of this FM-index, `"
536 			<< version << "', does not match the version required "
537 			"by this program, `" FM_VERSION "'.\n";
538 		exit(EXIT_FAILURE);
539 	}
540 
541 	in >> o.m_sampleSA;
542 	assert(in);
543 
544 	size_t n;
545 	in >> n >> expect("\n");
546 	assert(in);
547 	assert(n < std::numeric_limits<size_type>::max());
548 	o.m_alphabet.resize(n);
549 	in.read(reinterpret_cast<char*>(&o.m_alphabet[0]),
550 			n * sizeof o.m_alphabet[0]);
551 	o.setAlphabet(o.m_alphabet.begin(), o.m_alphabet.end());
552 
553 	in >> n >> expect("\n");
554 	assert(in);
555 	assert(n < std::numeric_limits<size_type>::max());
556 	o.m_sa.resize(n);
557 	in.read(reinterpret_cast<char*>(&o.m_sa[0]),
558 			n * sizeof o.m_sa[0]);
559 
560 	in >> o.m_occ;
561 	assert(in);
562 	o.countOccurrences();
563 
564 	return in;
565 }
566 
567 private:
568 
569 /** Build the cumulative frequency table m_cf from m_occ. */
countOccurrences()570 void countOccurrences()
571 {
572 	assert(!m_alphabet.empty());
573 	m_cf.resize(m_alphabet.size());
574 	// The sentinel character occurs once.
575 	m_cf[0] = 1;
576 	for (unsigned i = 0; i < m_cf.size() - 1; ++i)
577 		m_cf[i + 1] = m_cf[i] + m_occ.count(i);
578 }
579 
580 	unsigned m_sampleSA;
581 	std::vector<T> m_alphabet;
582 	std::vector<T> m_mapping;
583 	std::vector<size_type> m_cf;
584 	std::vector<size_type> m_sa;
585 	BitArrays m_occ;
586 };
587 
588 #endif
589