/*
 * range_cache.h
 *
 * Classes that encapsulate the caching of ranges of already-resolved
 * reference positions.
 */

#ifndef RANGE_CACHE_H_
#define RANGE_CACHE_H_

#include <iostream>
#include <map>
#include <set>
#include <stdexcept>
#include <stdint.h>
#include <string.h>
#include <utility>

#include "ds.h"
#include "ebwt.h"
#include "row_chaser.h"

#define RANGE_NOT_SET OFF_MASK
#define RANGE_CACHE_BAD_ALLOC OFF_MASK

/**
 * Manages a pool of memory used exclusively for range cache entries.
 * This manager is allocate-only; it exists mainly so that we can avoid
 * lots of new[]s and delete[]s.
 *
 * A given stretch of words may be one of two types: a cache entry, or
 * a cache entry wrapper.  A cache entry has a length and a list of
 * already-resolved reference positions.  A cache entry wrapper has a
 * pointer to a cache entry for a different range, along with an
 * integer indicating how many "jumps" to the left that range is from
 * the one that owns the wrapper.
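 *
 * Concretely, in the layout read back by RangeCacheEntry::init() and
 * written by RangeCache::tunnel() below, the two kinds of stretch look
 * like this (one word per bracketed field):
 *
 *   cache entry:  [len][ref off 0][ref off 1]...[ref off len-1]
 *   wrapper:      [CACHE_WRAPPER_BIT | jumps][pool index of target entry]
 *
 * A "ref off" slot holds OFF_MASK (RANGE_NOT_SET) until it is resolved.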
 */
class RangeCacheMemPool {
public:
	RangeCacheMemPool(TIndexOffU lim /* max cache size in bytes */) :
		lim_(lim >> 2 /* convert to words */), occ_(0), buf_(NULL),
		closed_(false)
	{
		if(lim_ > 0) {
			try {
				buf_ = new TIndexOffU[lim_];
				if(buf_ == NULL) throw std::bad_alloc();
			} catch(std::bad_alloc& e) {
				std::cerr << "Allocation error allocating " << lim
					 << " words of range-cache memory" << std::endl;
				throw 1;
			}
			assert(buf_ != NULL);
			// Fill with 1s to signal that these elements are
			// uninitialized
			memset(buf_, 0xff, lim_ << 2 /* convert back to bytes */);
		}
	}

	~RangeCacheMemPool() {
		// Release all word memory!
		if(lim_ > 0) delete[] buf_;
	}

	/**
	 * Allocate numElts elements from the word pool.
	 */
	TIndexOffU alloc(TIndexOffU numElts) {
		assert_gt(numElts, 0);
		assert_leq(occ_, lim_);
		if(occ_ + numElts > lim_ || numElts >= CACHE_WRAPPER_BIT) {
			return RANGE_CACHE_BAD_ALLOC;
		}
		assert_gt(lim_, 0);
		TIndexOffU ret = occ_;
		assert(allocs_.find(ret) == allocs_.end());
		ASSERT_ONLY(allocs_.insert(ret));
		// Clear the first elt so that we don't think there's already
		// something there
#ifndef NDEBUG
		for(TIndexOffU i = 0; i < numElts; i++) {
			assert_eq(OFF_MASK, buf_[occ_ + i]);
		}
#endif
		buf_[occ_] = 0;
		occ_ += numElts;
		assert_leq(occ_, lim_);
		if(lim_ - occ_ < 10) {
			// No more room - don't try anymore
			closed_ = true;
		}
		return ret;
	}

	/**
	 * Turn a pool-array index into a pointer; check that it doesn't
	 * fall outside the pool first.
	 */
	inline TIndexOffU *get(TIndexOffU off) {
		assert_gt(lim_, 0);
		assert_lt(off, lim_);
		assert(allocs_.find(off) != allocs_.end());
		TIndexOffU *ret = buf_ + off;
		assert_neq(CACHE_WRAPPER_BIT, ret[0]);
		assert_neq(OFF_MASK, ret[0]);
		return ret;
	}

	/**
	 * Return true iff there's no more room in the cache.
	 */
	inline bool closed() {
		return closed_;
	}

private:
	TIndexOffU lim_;  /// limit on number of 32-bit words to dish out in total
	TIndexOffU occ_;  /// number of occupied words
	TIndexOffU *buf_; /// buffer of 32-bit words
	bool closed_;     /// true iff the pool is out of room for new entries
#ifndef NDEBUG
	std::set<TIndexOffU> allocs_; // elements allocated
#endif
};
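
// Illustrative sketch (excluded from the build): how a pool allocation is
// used.  The 1 KB pool size and the 5-word entry are arbitrary example
// values, not sizes the aligner actually requests.
#if 0
static void rangeCacheMemPoolExample() {
	RangeCacheMemPool pool(1024);        // pool backed by 1 KB of memory
	TIndexOffU idx = pool.alloc(5);      // room for a length word + 4 offsets
	if(idx != RANGE_CACHE_BAD_ALLOC) {
		TIndexOffU *ent = pool.get(idx); // translate pool index to a pointer
		ent[0] = 4;                      // first word: number of offsets
		// Remaining words stay OFF_MASK (unresolved) until installed
	}
}
#endif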

/**
 * A view to a range of cached reference positions.
 */
class RangeCacheEntry {
public:
	/**
	 * Create an empty, invalid entry; it becomes valid via init().
	 */
	RangeCacheEntry(bool sanity = false) :
		top_(OFF_MASK), jumps_(0), len_(0), ents_(NULL), ebwt_(NULL),
		verbose_(false), sanity_(sanity)
	{ }

	/**
	 * Create a new RangeCacheEntry from the data in the pool at 'ent'.
	 */
	RangeCacheEntry(RangeCacheMemPool& pool, TIndexOffU top,
			TIndexOffU ent, Ebwt* ebwt, bool sanity = false) :
		verbose_(false), sanity_(sanity)
	{
		init(pool, top, ent, ebwt);
	}

	/**
	 * Initialize a RangeCacheEntry from the data in the pool at 'ent'.
	 */
	void init(RangeCacheMemPool& pool, TIndexOffU top, TIndexOffU ent, Ebwt* ebwt) {
		assert(ebwt != NULL);
		top_ = top;
		ebwt_ = ebwt;
		TIndexOffU *ents = pool.get(ent);
		assert_neq(CACHE_WRAPPER_BIT, ents[0]);
		// Is hi bit set?
		if((ents[0] & CACHE_WRAPPER_BIT) != 0) {
			// If so, the target is a wrapper and the non-hi bits
			// contain the # jumps
			jumps_ = (ents[0] & ~CACHE_WRAPPER_BIT);
			assert_gt(jumps_, 0);
			assert_leq(jumps_, ebwt_->_eh._len);
			// Get the target entry
			TIndexOffU *dest = pool.get(ents[1]);
			// Get the length from the target entry
			len_ = dest[0];
			assert_leq(top_ + len_, ebwt_->_eh._len);
			assert_gt(len_, 0);
			assert_leq(len_, ebwt_->_eh._len);
			// Get the pointer to the entries themselves
			ents_ = dest + 1;
		} else {
			// Not a wrapper, so there are no jumps
			jumps_ = 0;
			// Get the length from the target entry
			len_  = ents[0];
			assert_leq(top_ + len_, ebwt_->_eh._len);
			assert_gt(len_, 0);
			assert_leq(len_, ebwt_->_eh._len);
			// Get the pointer to the entries themselves
			ents_ = ents + 1;
		}
		if (sanity_)
			assert(sanityCheckEnts());
	}

	/**
	 * Initialize a wrapper with given number of jumps and given target
	 * entry index.
	 */
	void init(RangeCacheMemPool& pool, TIndexOffU top, TIndexOffU jumps,
			TIndexOffU ent, Ebwt* ebwt)
	{
		assert(ebwt != NULL);
		ebwt_ = ebwt;
		top_ = top;
		jumps_ = jumps;
		TIndexOffU *ents = pool.get(ent);
		// Must not be a wrapper
		assert_eq(0, ents[0] & CACHE_WRAPPER_BIT);
		// Get the length from the target entry
		len_ = ents[0];
		assert_gt(len_, 0);
		assert_leq(len_, ebwt_->_eh._len);
		// Get the pointer to the entries themselves
		ents_ = ents + 1;
		assert_leq(top_ + len_, ebwt_->_eh._len);
		if (sanity_)
			assert(sanityCheckEnts());
	}

	TIndexOffU len() const {
		assert(ents_ != NULL);
		assert(ebwt_ != NULL);
		return len_;
	}

	TIndexOffU jumps() const {
		assert(ents_ != NULL);
		assert(ebwt_ != NULL);
		return jumps_;
	}

	/**
	 * Make this entry invalid, so that it no longer refers to a cache
	 * entry.
	 */
	void reset() {
		ents_ = NULL;
	}

	/**
	 * Return true iff this object represents a valid cache entry.
	 */
	bool valid() const {
		return ents_ != NULL;
	}

	Ebwt *ebwt() {
		return ebwt_;
	}

	/**
	 * Install a result obtained by a client of this cache; be sure to
	 * adjust for how many jumps down the tunnel the cache entry is
	 * situated.
	 */
	void install(TIndexOffU elt, TIndexOffU val) {
		if(ents_ == NULL) {
			// This is not a valid cache entry; do nothing
			return;
		}
		assert(ents_ != NULL);
		assert(ebwt_ != NULL);
		assert_leq(jumps_, val);
		assert_neq(OFF_MASK, val);
		assert_leq(top_ + len_, ebwt_->_eh._len);
		if(elt < len_) {
			val -= jumps_;
			if(verbose_) std::cout << "Installed reference offset: " << (top_ + elt) << std::endl;
			ASSERT_ONLY(TIndexOffU sanity = RowChaser::toFlatRefOff(ebwt_, 1, top_ + elt));
			assert_eq(sanity, val);
#ifndef NDEBUG
			for(size_t i = 0; i < len_; i++) {
				if(i == elt) continue;
				assert_neq(val, ents_[i]);
			}
#endif
			ents_[elt] = val;
		} else {
			// ignore install request
			if(verbose_) std::cout << "Fell off end of cache entry for install: " << (top_ + elt) << std::endl;
		}
	}

	/**
	 * Get an element from the cache, adjusted for tunnel jumps.
	 */
	inline TIndexOffU get(TIndexOffU elt) const {
		if(ents_ == NULL) {
			// This is not a valid cache entry; do nothing
			return RANGE_NOT_SET;
		}
		assert(ents_ != NULL);
		assert(ebwt_ != NULL);
		assert_leq(top_ + len_, ebwt_->_eh._len);
		if(elt < len_ && ents_[elt] != RANGE_NOT_SET) {
			if(verbose_) std::cout << "Retrieved result from cache: " << (top_ + elt) << std::endl;
			TIndexOffU ret = ents_[elt] + jumps_;
			ASSERT_ONLY(TIndexOffU sanity = RowChaser::toFlatRefOff(ebwt_, 1, top_ + elt));
			assert_eq(sanity, ret);
			return ret;
		} else {
			if(verbose_) std::cout << "Cache entry not set: " << (top_ + elt) << std::endl;
			return RANGE_NOT_SET;
		}
	}

	/**
	 * Check that the given length and entry array (as stored in the
	 * pool) make sense.
	 */
	static bool sanityCheckEnts(TIndexOffU len, TIndexOffU *ents, Ebwt* ebwt) {
		assert_gt(len, 0);
		assert_leq(len, ebwt->_eh._len);
		if(len < 10) {
			for(size_t i = 0; i < len; i++) {
				if(ents[i] == OFF_MASK) continue;
				assert_leq(ents[i], ebwt->_eh._len);
				for(size_t j = i+1; j < len; j++) {
					if(ents[j] == OFF_MASK) continue;
					assert_neq(ents[i], ents[j]);
				}
			}
		} else {
			std::set<TIndexOffU> seen;
			for(size_t i = 0; i < len; i++) {
				if(ents[i] == OFF_MASK) continue;
				assert(seen.find(ents[i]) == seen.end());
				seen.insert(ents[i]);
			}
		}
		return true;
	}

	/**
	 * Check that len_ and the ents_ array both make sense.
	 */
	bool sanityCheckEnts() {
		return RangeCacheEntry::sanityCheckEnts(len_, ents_, ebwt_);
	}

private:

	TIndexOffU top_;   /// top pointer for this range
	TIndexOffU jumps_; /// how many tunnel-jumps it is away from the requester
	TIndexOffU len_;   /// # of entries in cache entry
	TIndexOffU *ents_; /// ptr to entries, which are flat offs within joined ref
	Ebwt    *ebwt_;    /// index that alignments are in
	bool     verbose_; /// be talkative?
	bool     sanity_;  /// do consistency checks?
};
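
// Illustrative sketch (excluded from the build): the get-then-install pattern
// a caller would follow with an entry obtained from RangeCache::lookup().
// 'elt' is an offset into the range and 'resolvedOff' is a reference offset
// the caller computed itself (e.g. by walking the BWT); both are assumed.
#if 0
static void rangeCacheEntryExample(RangeCacheEntry& ent, TIndexOffU elt,
                                   TIndexOffU resolvedOff)
{
	if(!ent.valid()) return;        // lookup failed; nothing is cached
	TIndexOffU off = ent.get(elt);  // jump-adjusted offset, or RANGE_NOT_SET
	if(off == RANGE_NOT_SET) {
		// Not resolved yet; resolve it elsewhere, then publish the result
		// so that later queries in the same tunnel hit the cache.
		ent.install(elt, resolvedOff);
	}
}
#endif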

/**
 * Caches the lists of resolved reference offsets for BWT ranges, keyed
 * by each range's top offset.
 */
class RangeCache {
	typedef EList<TIndexOffU> TUVec;
	typedef std::map<TIndexOffU, TIndexOffU> TMap;
	typedef std::map<TIndexOffU, TIndexOffU>::iterator TMapItr;

public:
	RangeCache(TIndexOffU lim, Ebwt* ebwt) :
		lim_(lim), map_(), pool_(lim), closed_(false), ebwt_(ebwt), sanity_(true) { }

	/**
	 * Given top and bot offsets, retrieve the canonical cache entry
	 * that best covers that range.  The cache entry may not directly
	 * correspond to the top offset provided; rather, it might be an
	 * entry that lies "at the end of the tunnel" when top and bot are
	 * walked backward.
	 */
	bool lookup(TIndexOffU top, TIndexOffU bot, RangeCacheEntry& ent) {
		if(ebwt_ == NULL || lim_ == 0) return false;
		assert_gt(bot, top);
		ent.reset();
		TMapItr itr = map_.find(top);
		if(itr == map_.end()) {
			// No cache entry for the given 'top' offset
			if(closed_) {
				return false; // failed to get cache entry
			} else {
				if(pool_.closed()) {
					closed_ = true;
					return false; // failed to get cache entry
				}
			}
			// Use the tunnel
			bool ret = tunnel(top, bot, ent);
			return ret;
		} else {
			// There is a cache entry for the given 'top' offset
			TIndexOffU ret = itr->second;
			ent.init(pool_, top, ret, ebwt_);
			return true; // success
		}
	}

	/**
	 * Exhaustively check all entries linked to from map_ to ensure
	 * they're well-formed.
	 */
	bool repOk() {
#ifndef NDEBUG
		for(TMapItr itr = map_.begin(); itr != map_.end(); itr++) {
			TIndexOffU top = itr->first;
			TIndexOffU idx = itr->second;
			TIndexOffU jumps = 0;
			assert_leq(top, ebwt_->_eh._len);
			TIndexOffU *ents = pool_.get(idx);
			if((ents[0] & CACHE_WRAPPER_BIT) != 0) {
				jumps = ents[0] & ~CACHE_WRAPPER_BIT;
				assert_leq(jumps, ebwt_->_eh._len);
				idx = ents[1];
				ents = pool_.get(idx);
			}
			TIndexOffU len = ents[0];
			assert_leq(top + len, ebwt_->_eh._len);
			if (sanity_)
				RangeCacheEntry::sanityCheckEnts(len, ents + 1, ebwt_);
		}
#endif
		return true;
	}

protected:

	/**
	 * Tunnel through to the first range that 1) includes all the same
	 * suffixes (though longer) as the given range, and 2) has a cache
	 * entry for it.
	 */
	bool tunnel(TIndexOffU top, TIndexOffU bot, RangeCacheEntry& ent) {
		assert_gt(bot, top);
		TUVec tops;
		const TIndexOffU spread = bot - top;
		SideLocus tloc, bloc;
		SideLocus::initFromTopBot(top, bot, ebwt_->_eh, ebwt_->_ebwt, tloc, bloc);
		TIndexOffU newtop = top, newbot = bot;
		TIndexOffU jumps = 0;
		// Walk left through the tunnel
		while(true) {
			if(ebwt_->rowL(tloc) != ebwt_->rowL(bloc)) {
				// Different characters at top and bot positions of
				// BWT; this means that the calls to mapLF below are
				// guaranteed to yield rows in two different character-
				// sections of the BWT.
				break;
			}
			// Advance top and bot
			newtop = ebwt_->mapLF(tloc);
			newbot = ebwt_->mapLF(bloc);
			assert_geq(newbot, newtop);
			assert_leq(newbot - newtop, spread);
			// If the new spread is the same as the old spread, we can
			// be confident that the new range includes all of the same
			// suffixes as the last range (though longer by 1 char)
			if((newbot - newtop) == spread) {
				// Check if newtop is already cached
				TMapItr itr = map_.find(newtop);
				jumps++;
				if(itr != map_.end()) {
					// This range, which is further to the left in the
					// same tunnel as the query range, has a cache
					// entry already, so use that
					TIndexOffU idx = itr->second;
					TIndexOffU *ents = pool_.get(idx);
					if((ents[0] & CACHE_WRAPPER_BIT) != 0) {
						// The cache entry we found was a wrapper; make
						// a new wrapper that points to that wrapper's
						// target, with the appropriate number of jumps
						jumps += (ents[0] & ~CACHE_WRAPPER_BIT);
						idx = ents[1];
					}
					// Allocate a new wrapper
					TIndexOffU newentIdx = pool_.alloc(2);
					if(newentIdx != RANGE_CACHE_BAD_ALLOC) {
						// We successfully made a new wrapper entry;
						// now populate it and install it in map_
						TIndexOffU *newent = pool_.get(newentIdx); // get ptr to it
						assert_eq(0, newent[0]);
						newent[0] = CACHE_WRAPPER_BIT | jumps; // set jumps
						newent[1] = idx;                // set target
						assert(map_.find(top) == map_.end());
						map_[top] = newentIdx;
						if(sanity_) assert(repOk());
					}
					// Initialize the entry
					ent.init(pool_, top, jumps, idx, ebwt_);
					return true;
				}
				// Save this range
				tops.push_back(newtop);
				SideLocus::initFromTopBot(newtop, newbot, ebwt_->_eh, ebwt_->_ebwt, tloc, bloc);
			} else {
				// Not all the suffixes were preserved, so we can't
				// link the source range's cached result to this
				// range's cached results
				break;
			}
			assert_eq(jumps, tops.size());
		}
		assert_eq(jumps, tops.size());
		// Try to create a new cache entry for the leftmost range in
		// the tunnel (which might be the query range)
		TIndexOffU newentIdx = pool_.alloc(spread + 1);
		if(newentIdx != RANGE_CACHE_BAD_ALLOC) {
			// Successfully allocated new range cache entry; install it
			TIndexOffU *newent = pool_.get(newentIdx);
			assert_eq(0, newent[0]);
			// Store cache-range length in first word
			newent[0] = spread;
			assert_lt(newent[0], CACHE_WRAPPER_BIT);
			assert_eq(spread, newent[0]);
			TIndexOffU entTop = top;
			TIndexOffU jumps = 0;
			if(tops.size() > 0) {
				entTop = tops.back();
				jumps = (TIndexOffU)tops.size();
			}
			// Cache the entry for the end of the tunnel
			assert(map_.find(entTop) == map_.end());
			map_[entTop] = newentIdx;
			if(sanity_) assert(repOk());
			ent.init(pool_, entTop, jumps, newentIdx, ebwt_);
			assert_eq(spread, newent[0]);
			if(jumps > 0) {
				assert_neq(entTop, top);
				// Cache a wrapper entry for the query range (if possible)
				TIndexOffU wrapentIdx = pool_.alloc(2);
				if(wrapentIdx != RANGE_CACHE_BAD_ALLOC) {
					TIndexOffU *wrapent = pool_.get(wrapentIdx);
					assert_eq(0, wrapent[0]);
					wrapent[0] = CACHE_WRAPPER_BIT | jumps;
					wrapent[1] = newentIdx;
					assert(map_.find(top) == map_.end());
					map_[top] = wrapentIdx;
					if(sanity_) assert(repOk());
				}
			}
			return true;
		} else {
			// Could not allocate new range cache entry
			return false;
		}
	}

	TIndexOffU lim_;         /// total number of key/val bytes to keep in cache
	TMap map_;               /// map from range tops to indices of cache entries in pool_
	RangeCacheMemPool pool_; /// memory pool backing the cache entries
	bool closed_;            /// out of space; no new entries
	Ebwt* ebwt_;             /// index that alignments are in
	bool sanity_;            /// do consistency checks?
};
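
// Illustrative sketch (excluded from the build): typical flow around lookup().
// 'ebwt', 'top' and 'bot' are assumed to come from a completed backward-search
// step; 64 KB is an arbitrary example cache size.
#if 0
static void rangeCacheExample(Ebwt* ebwt, TIndexOffU top, TIndexOffU bot) {
	RangeCache cache(64 * 1024, ebwt);  // cache backed by a 64 KB pool
	RangeCacheEntry ent;
	if(cache.lookup(top, bot, ent)) {
		// 'ent' now views the cached entry at the end of the tunnel that
		// covers [top, bot); unresolved slots read back as RANGE_NOT_SET.
		TIndexOffU off0 = ent.get(0);
		(void)off0;
	}
}
#endif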

#endif /* RANGE_CACHE_H_ */