1 #ifndef FMINDEX_H
2 #define FMINDEX_H 1
3
4 #include "config.h"
5 #include "BitArrays.h"
6 #include "IOUtil.h"
7 #include "sais.hxx"
8 #include <boost/integer.hpp>
9 #include <algorithm>
10 #include <cassert>
11 #include <cstdlib> // for exit
12 #include <iostream>
13 #include <iterator>
14 #include <limits> // for numeric_limits
15 #include <stdint.h>
16 #include <string>
17 #include <vector>
18
19 /** An FM index. */
20 class FMIndex
21 {
22 /** A symbol. */
23 typedef uint8_t T;
24
25 /** The sentinel symbol. */
SENTINEL()26 static T SENTINEL() { return std::numeric_limits<T>::max(); }
27
28 public:
29 /** An index. */
30 typedef boost::uint_t<FMBITS>::least size_type;
31
32 /** An index for SAIS, which must be signed. */
33 typedef boost::int_t<FMBITS>::least sais_size_type;
34
35 /** The type of a symbol. */
36 typedef T value_type;
37
38 /** A suffix array interval. */
39 struct SAInterval
40 {
41 size_type l, u;
SAIntervalSAInterval42 SAInterval(size_type l, size_type u) : l(l), u(u) { }
SAIntervalSAInterval43 SAInterval(const FMIndex& fm) : l(1), u(fm.m_occ.size()) { }
emptySAInterval44 bool empty() const { return l >= u; }
45 bool operator==(const SAInterval& x) const
46 {
47 return x.l == l && x.u == u;
48 }
49
sizeSAInterval50 size_type size() const
51 {
52 assert(l <= u);
53 return u - l;
54 }
55 };
56
57 /** A match of a substring of a query sequence to an FM index. */
58 struct Match : public SAInterval
59 {
60 unsigned qstart, qend, num;
61
MatchMatch62 Match() : SAInterval(0, 0), qstart(0), qend(0), num(0) { }
63
64 Match(size_type l, size_type u, unsigned qstart, unsigned qend,
65 unsigned num = 1)
SAIntervalMatch66 : SAInterval(l, u), qstart(qstart), qend(qend), num(num) { }
67
qspanMatch68 unsigned qspan() const
69 {
70 assert(qstart <= qend);
71 return qend - qstart;
72 }
73 };
74
FMIndex()75 FMIndex() : m_sampleSA(1) { }
76
77 /** Return the size of the string not counting the sentinel. */
size()78 size_t size() const { return m_occ.size() - 1; }
79
80 /** The size of the alphabet. */
alphabetSize()81 unsigned alphabetSize() const { return m_alphabet.size(); }
82
83 /** Encode the alphabet of [first, last). */
84 template <typename It>
encode(It first,It last)85 void encode(It first, It last) const
86 {
87 assert(first < last);
88 assert(!m_alphabet.empty());
89 std::transform(first, last, first, Translate(*this));
90 }
91
92 /** Decode the alphabet of [first, last). */
93 template <typename It>
decode(It first,It last)94 void decode(It first, It last) const
95 {
96 assert(first < last);
97 assert(!m_alphabet.empty());
98 for (It it = first; it < last; ++it)
99 *it = m_alphabet[*it];
100 }
101
102 /** Build the BWT of [first, last).
103 * The BWT including the sentinel is stored in [first, last].
104 * @return the position of the sentinel
105 */
106 template<typename It>
buildBWT(It first,It last)107 size_type buildBWT(It first, It last) const
108 {
109 assert(first < last);
110 assert(size_t(last - first)
111 < std::numeric_limits<size_type>::max());
112 encode(first, last);
113 std::replace(first, last, SENTINEL(), T(0));
114
115 std::cerr << "Building the Burrows-Wheeler transform...\n";
116 size_t n = last - first;
117 std::vector<sais_size_type> sa(n);
118 assert(sizeof (size_type) == sizeof (sais_size_type));
119 sais_size_type sentinel = saisxx_bwt(first, first,
120 &sa[0],
121 (sais_size_type)n,
122 (sais_size_type)m_alphabet.size());
123 assert(sentinel >= 0);
124 if (sentinel < 0)
125 abort();
126 // Insert the sentinel.
127 std::copy_backward(first + sentinel, last, last + 1);
128 first[sentinel] = SENTINEL();
129 return sentinel;
130 }
131
132 /** Set the specified element of the sampled suffix array. */
setSA(size_t sai,size_t pos)133 void setSA(size_t sai, size_t pos)
134 {
135 if (sai % m_sampleSA == 0) {
136 size_t i = sai / m_sampleSA;
137 assert(i < m_sa.size());
138 m_sa[i] = pos;
139 }
140 }
141
142 /** Construct the suffix array from the FM index. */
constructSuffixArray()143 void constructSuffixArray()
144 {
145 // The length of the original string.
146 size_t n = m_occ.size() - 1;
147 assert(n > 0);
148 assert(m_sampleSA > 0);
149 m_sa.resize(n / m_sampleSA + 1);
150 size_t sai = 0;
151 for (size_t i = n; i > 0; i--) {
152 setSA(sai, i);
153 T c = m_occ.at(sai);
154 assert(c != SENTINEL());
155 sai = m_cf[c] + m_occ.rank(c, sai);
156 assert(sai > 0);
157 }
158 setSA(sai, 0);
159 }
160
161 /** Build an FM-index of the specified BWT. */
162 template<typename It>
assignBWT(It first,It last)163 void assignBWT(It first, It last)
164 {
165 assert(last - first > 1);
166 assert(size_t(last - first)
167 < std::numeric_limits<size_type>::max());
168
169 std::cerr << "Building the character occurrence table...\n";
170 m_occ.assign(first, last);
171 countOccurrences();
172
173 // Construct the suffix array from the FM index.
174 std::cerr << "Building the suffix array...\n";
175 constructSuffixArray();
176 }
177
178 /** Build an FM-index of the specified data. */
179 template<typename It>
assign(It first,It last)180 void assign(It first, It last)
181 {
182 assert(first < last);
183 assert(size_t(last - first)
184 < std::numeric_limits<size_type>::max());
185
186 encode(first, last);
187 std::replace(first, last, SENTINEL(), T(0));
188
189 // Construct the suffix array.
190 std::cerr << "Building the suffix array...\n";
191 size_t n = last - first;
192 m_sampleSA = 1;
193 m_sa.resize(n + 1);
194 m_sa[0] = n;
195
196 assert(sizeof (size_type) == sizeof (sais_size_type));
197 int status = saisxx(first,
198 reinterpret_cast<sais_size_type*>(&m_sa[1]),
199 (sais_size_type)n,
200 (sais_size_type)m_alphabet.size());
201 assert(status == 0);
202 if (status != 0)
203 abort();
204
205 // Construct the Burrows-Wheeler transform.
206 std::vector<T> bwt;
207 std::cerr << "Building the Burrows-Wheeler transform...\n";
208 bwt.resize(m_sa.size());
209 for (size_t i = 0; i < m_sa.size(); i++)
210 bwt[i] = m_sa[i] == 0 ? SENTINEL() : first[m_sa[i] - 1];
211
212 std::cerr << "Building the character occurrence table...\n";
213 m_occ.assign(bwt.begin(), bwt.end());
214 countOccurrences();
215 }
216
217 /** Sample the suffix array. */
sampleSA(unsigned period)218 void sampleSA(unsigned period)
219 {
220 assert(period > 0);
221 if (period == m_sampleSA)
222 return;
223 assert(m_sampleSA == 1);
224 m_sampleSA = period;
225 if (m_sampleSA == 1 || m_sa.empty())
226 return;
227 std::vector<size_type>::iterator out = m_sa.begin();
228 for (size_t i = 0; i < m_sa.size(); i += m_sampleSA)
229 *out++ = m_sa[i];
230 m_sa.erase(out, m_sa.end());
231 assert(!m_sa.empty());
232 }
233
234 /** Return the specified element of the suffix array. */
at(size_t i)235 size_t at(size_t i) const
236 {
237 assert(i < m_occ.size());
238 size_t n = 0;
239 while (i % m_sampleSA != 0) {
240 T c = m_occ.at(i);
241 i = c == SENTINEL() ? 0 : m_cf[c] + m_occ.rank(c, i);
242 n++;
243 }
244 assert(i / m_sampleSA < m_sa.size());
245 size_t pos = m_sa[i / m_sampleSA] + n;
246 return pos < m_occ.size() ? pos : pos - m_occ.size();
247 }
248
249 /** Return the specified element of the suffix array. */
250 size_t operator[](size_t i) const
251 {
252 return at(i);
253 }
254
255 /** Return the symbol at the specifed position of the BWT. */
bwtAt(size_t i)256 T bwtAt(size_t i) const
257 {
258 assert(i < m_occ.size());
259 T c = m_occ.at(i);
260 assert(c != SENTINEL());
261 assert(c < m_alphabet.size());
262 return m_alphabet[c];
263 }
264
265 /** Return the first symbol of the specified suffix. */
symbolAt(size_t i)266 T symbolAt(size_t i) const
267 {
268 assert(i < m_occ.size());
269 std::vector<size_type>::const_iterator it
270 = std::upper_bound(m_cf.begin(), m_cf.end(), i);
271 assert(it != m_cf.begin());
272 T c = it - m_cf.begin() - 1;
273 assert(c < m_alphabet.size());
274 return m_alphabet[c];
275 }
276
277 /** Decompress the index. */
278 template <typename It>
decompress(It out)279 void decompress(It out)
280 {
281 // Construct the original string.
282 std::vector<T> s;
283 for (size_t i = 0;;) {
284 assert(i < m_occ.size());
285 T c = m_occ.at(i);
286 if (c == SENTINEL())
287 break;
288 s.push_back(c);
289 i = m_cf[c] + m_occ.rank(c, i);
290 assert(i > 0);
291 }
292
293 // Translate the character set and output the result.
294 for (std::vector<T>::reverse_iterator it = s.rbegin();
295 it != s.rend(); ++it) {
296 T c = *it;
297 assert(c < m_alphabet.size());
298 *out++ = m_alphabet[c];
299 }
300 }
301
302 /** Extend a suffix array coordinate by one character to the left. */
update(size_type i,T c)303 size_type update(size_type i, T c) const
304 {
305 assert(c < m_cf.size());
306 return m_cf[c] + m_occ.rank(c, i);
307 }
308
309 /** Extend a suffix array interval by one character to the left. */
update(SAInterval sai,T c)310 SAInterval update(SAInterval sai, T c) const
311 {
312 return SAInterval(update(sai.l, c), update(sai.u, c));
313 }
314
315 /** Search for an exact match. */
316 template <typename It>
findExact(It first,It last,SAInterval sai)317 SAInterval findExact(It first, It last, SAInterval sai) const
318 {
319 assert(first < last);
320 for (It it = last - 1; it >= first && !sai.empty(); --it) {
321 T c = *it;
322 if (c == SENTINEL())
323 return SAInterval(0, 0);
324 sai = update(sai, c);
325 }
326 return sai;
327 }
328
329 /** Search for an exact match. */
330 template <typename String>
findExact(const String & q)331 SAInterval findExact(const String& q) const
332 {
333 String s(q.size());
334 std::transform(q.begin(), q.end(), s.begin(), Translate(*this));
335 return findExact(s.begin(), s.end());
336 }
337
338 /** Search for a suffix of the query that matches a prefix of the
339 * target.
340 */
341 template <typename It, typename OutIt>
findOverlapSuffix(It first,It last,OutIt out,unsigned minOverlap)342 OutIt findOverlapSuffix(It first, It last, OutIt out,
343 unsigned minOverlap) const
344 {
345 assert(first < last);
346
347 SAInterval sai(*this);
348 for (It it = last - 1; it >= first && !sai.empty(); --it) {
349 T c = *it;
350 if (c == SENTINEL())
351 break;
352 sai = update(sai, c);
353 if (sai.empty())
354 break;
355
356 if (unsigned(last - it) < minOverlap)
357 continue;
358
359 SAInterval sai1 = update(sai, 0);
360 if (!sai1.empty())
361 *out++ = Match(sai1.l, sai1.u,
362 it - first, last - first);
363 }
364 return out;
365 }
366
367 /** Search for a suffix of the query that matches a prefix of the
368 * target.
369 */
370 template <typename OutIt>
findOverlapSuffix(const std::string & q,OutIt out,unsigned minOverlap)371 OutIt findOverlapSuffix(const std::string& q, OutIt out,
372 unsigned minOverlap) const
373 {
374 std::string s = q;
375 std::transform(s.begin(), s.end(), s.begin(), Translate(*this));
376 return findOverlapSuffix(s.begin(), s.end(), out, minOverlap);
377 }
378
379 /** Search for a prefix of the query that matches a suffix of the
380 * target.
381 */
382 template <typename OutIt>
findOverlapPrefix(const std::string & q,OutIt out,unsigned minOverlap)383 OutIt findOverlapPrefix(const std::string& q, OutIt out,
384 unsigned minOverlap) const
385 {
386 std::string s = q;
387 std::transform(s.begin(), s.end(), s.begin(), Translate(*this));
388 typedef std::string::const_iterator It;
389 It first = s.begin();
390 for (It it = first + minOverlap; it <= s.end(); ++it) {
391 SAInterval sai = findExact(first, it,
392 update(SAInterval(*this), 0));
393 if (!sai.empty())
394 *out++ = Match(sai.l, sai.u, 0, it - first);
395 }
396 return out;
397 }
398
399 /** Search for a matching suffix of the query. */
400 template <typename It, typename MemoIt>
findSuffix(It first,It last,MemoIt memoIt)401 Match findSuffix(It first, It last, MemoIt memoIt) const
402 {
403 assert(first < last);
404
405 SAInterval sai(*this);
406 It it;
407 for (it = last - 1; it >= first && !sai.empty(); --it) {
408 T c = *it;
409 if (c == SENTINEL())
410 break;
411 SAInterval sai1 = update(sai, c);
412 if (sai1.empty())
413 break;
414 sai = sai1;
415
416 if (*memoIt == sai) {
417 // This vertex of the prefix DAWG has been visited.
418 break;
419 }
420 *memoIt++ = sai;
421 }
422 return Match(sai.l, sai.u, it - first + 1, last - first);
423 }
424
425 /** Search for a matching substring of the query at least k long.
426 * @return the longest match
427 */
428 template <typename It>
findSubstring(It first,It last,unsigned k)429 Match findSubstring(It first, It last, unsigned k) const
430 {
431 assert(first < last);
432 Match best(0, 0, 0, k > 0 ? k - 1 : 0);
433
434 // Record which vertices of the prefix DAWG have been visited.
435 std::vector<SAInterval> memo(last - first, SAInterval(0, 0));
436 SAInterval* memoIt = &memo[0];
437 for (It it = last; it > first; --it) {
438 if (unsigned(it - first) < best.qspan())
439 return best;
440 Match interval = findSuffix(first, it, memoIt++);
441 if (interval.qspan() > best.qspan())
442 best = interval;
443 else if (interval.qspan() == best.qspan())
444 best.num++;
445 }
446 return best;
447 }
448
449 /** Translate from ASCII to the indexed alphabet. */
450 struct Translate {
TranslateTranslate451 Translate(const FMIndex& fmIndex) : m_fmIndex(fmIndex) { }
operatorTranslate452 T operator()(unsigned char c) const
453 {
454 return c < m_fmIndex.m_mapping.size()
455 ? m_fmIndex.m_mapping[c] : SENTINEL();
456 }
457 private:
458 const FMIndex& m_fmIndex;
459 };
460
461 /** Search for a matching substring of the query at least k long.
462 * @return the longest match and the number of matches
463 */
find(const std::string & q,unsigned k)464 Match find(const std::string& q, unsigned k) const
465 {
466 std::string s = q;
467 std::transform(s.begin(), s.end(), s.begin(), Translate(*this));
468 return findSubstring(s.begin(), s.end(), k);
469 }
470
471 /** Set the alphabet to [first, last).
472 * The character '\0' is treated specially and not included in the
473 * alphabet.
474 */
475 template <typename It>
setAlphabet(It first,It last)476 void setAlphabet(It first, It last)
477 {
478 assert(first < last);
479 std::vector<bool> mask(std::numeric_limits<T>::max());
480 for (It it = first; it < last; ++it) {
481 assert((size_t)*it < mask.size());
482 mask[*it] = true;
483 }
484
485 // Remove the character '\0' from the alphabet.
486 mask[0] = false;
487
488 m_alphabet.clear();
489 m_mapping.clear();
490 for (unsigned c = 0; c < mask.size(); ++c) {
491 if (!mask[c])
492 continue;
493 m_mapping.resize(c + 1, std::numeric_limits<T>::max());
494 m_mapping[c] = m_alphabet.size();
495 m_alphabet.push_back(c);
496 }
497 assert(!m_alphabet.empty());
498 }
499
500 /** Set the alphabet to the characters of s. */
setAlphabet(const std::string & s)501 void setAlphabet(const std::string& s)
502 {
503 assert(!s.empty());
504 setAlphabet(s.begin(), s.end());
505 }
506
507 #define STRINGIFY(X) #X
508 #define FM_VERSION_BITS(BITS) "FM " STRINGIFY(BITS) " 1"
509 #define FM_VERSION FM_VERSION_BITS(FMBITS)
510
511 /** Store an index. */
512 friend std::ostream& operator<<(std::ostream& out, const FMIndex& o)
513 {
514 out << FM_VERSION << '\n'
515 << o.m_sampleSA << '\n';
516
517 out << o.m_alphabet.size() << '\n';
518 out.write(reinterpret_cast<const char*>(&o.m_alphabet[0]),
519 o.m_alphabet.size() * sizeof o.m_alphabet[0]);
520
521 out << o.m_sa.size() << '\n';
522 out.write(reinterpret_cast<const char*>(&o.m_sa[0]),
523 o.m_sa.size() * sizeof o.m_sa[0]);
524
525 return out << o.m_occ;
526 }
527
528 /** Load an index. */
529 friend std::istream& operator>>(std::istream& in, FMIndex& o)
530 {
531 std::string version;
532 std::getline(in, version);
533 assert(in);
534 if (version != FM_VERSION) {
535 std::cerr << "error: the version of this FM-index, `"
536 << version << "', does not match the version required "
537 "by this program, `" FM_VERSION "'.\n";
538 exit(EXIT_FAILURE);
539 }
540
541 in >> o.m_sampleSA;
542 assert(in);
543
544 size_t n;
545 in >> n >> expect("\n");
546 assert(in);
547 assert(n < std::numeric_limits<size_type>::max());
548 o.m_alphabet.resize(n);
549 in.read(reinterpret_cast<char*>(&o.m_alphabet[0]),
550 n * sizeof o.m_alphabet[0]);
551 o.setAlphabet(o.m_alphabet.begin(), o.m_alphabet.end());
552
553 in >> n >> expect("\n");
554 assert(in);
555 assert(n < std::numeric_limits<size_type>::max());
556 o.m_sa.resize(n);
557 in.read(reinterpret_cast<char*>(&o.m_sa[0]),
558 n * sizeof o.m_sa[0]);
559
560 in >> o.m_occ;
561 assert(in);
562 o.countOccurrences();
563
564 return in;
565 }
566
567 private:
568
569 /** Build the cumulative frequency table m_cf from m_occ. */
countOccurrences()570 void countOccurrences()
571 {
572 assert(!m_alphabet.empty());
573 m_cf.resize(m_alphabet.size());
574 // The sentinel character occurs once.
575 m_cf[0] = 1;
576 for (unsigned i = 0; i < m_cf.size() - 1; ++i)
577 m_cf[i + 1] = m_cf[i] + m_occ.count(i);
578 }
579
580 unsigned m_sampleSA;
581 std::vector<T> m_alphabet;
582 std::vector<T> m_mapping;
583 std::vector<size_type> m_cf;
584 std::vector<size_type> m_sa;
585 BitArrays m_occ;
586 };
587
588 #endif
589