/*
 * range_cache.h
 *
 * Classes that encapsulate the caching of ranges' resolved reference
 * offsets.
 */

#ifndef RANGE_CACHE_H_
#define RANGE_CACHE_H_

#include <iostream>
#include <map>
#include <set>
#include <stdexcept>
#include <stdint.h>
#include <cstring>
#include <utility>

#include "ds.h"
#include "ebwt.h"
#include "row_chaser.h"

#define RANGE_NOT_SET OFF_MASK
#define RANGE_CACHE_BAD_ALLOC OFF_MASK

/**
 * Manages a pool of memory used exclusively for range cache entries.
 * This manager is allocate-only; it exists mainly so that we can avoid
 * lots of new[]s and delete[]s.
 *
 * A given stretch of words may be one of two types: a cache entry, or
 * a cache entry wrapper.  A cache entry has a length and a list of
 * already-resolved reference positions.  A cache entry wrapper has a
 * pointer to a cache entry for a different range, along with an
 * integer indicating how many "jumps" to the left that range is from
 * the one that owns the wrapper.
 */
class RangeCacheMemPool {
public:
	RangeCacheMemPool(TIndexOffU lim /* max cache size in bytes */) :
		lim_(lim >> 2 /* convert to words */), occ_(0), buf_(NULL),
		closed_(false)
	{
		if(lim_ > 0) {
			try {
				buf_ = new TIndexOffU[lim_];
				if(buf_ == NULL) throw std::bad_alloc();
			} catch(std::bad_alloc& e) {
				cerr << "Allocation error allocating " << lim
				     << " bytes of range-cache memory" << endl;
				throw 1;
			}
			assert(buf_ != NULL);
			// Fill with 1s to signal that these elements are
			// uninitialized
			memset(buf_, 0xff, lim_ << 2 /* convert back to bytes */);
		}
	}

	~RangeCacheMemPool() {
		// Release all word memory!
		if(lim_ > 0) delete[] buf_;
	}

	/**
	 * Allocate numElts elements from the word pool.
	 */
	TIndexOffU alloc(TIndexOffU numElts) {
		assert_gt(numElts, 0);
		assert_leq(occ_, lim_);
		if(occ_ + numElts > lim_ || numElts >= CACHE_WRAPPER_BIT) {
			return RANGE_CACHE_BAD_ALLOC;
		}
		assert_gt(lim_, 0);
		TIndexOffU ret = occ_;
		assert(allocs_.find(ret) == allocs_.end());
		ASSERT_ONLY(allocs_.insert(ret));
		// Clear the first elt so that we don't think there's already
		// something there
#ifndef NDEBUG
		for(TIndexOffU i = 0; i < numElts; i++) {
			assert_eq(OFF_MASK, buf_[occ_ + i]);
		}
#endif
		buf_[occ_] = 0;
		occ_ += numElts;
		assert_leq(occ_, lim_);
		if(lim_ - occ_ < 10) {
			// No more room - don't try anymore
			closed_ = true;
		}
		return ret;
	}

	/**
	 * Turn a pool-array index into a pointer; check that it doesn't
	 * fall outside the pool first.
	 */
	inline TIndexOffU *get(TIndexOffU off) {
		assert_gt(lim_, 0);
		assert_lt(off, lim_);
		assert(allocs_.find(off) != allocs_.end());
		TIndexOffU *ret = buf_ + off;
		assert_neq(CACHE_WRAPPER_BIT, ret[0]);
		assert_neq(OFF_MASK, ret[0]);
		return ret;
	}

	/**
	 * Return true iff there's no more room in the cache.
	 */
	inline bool closed() {
		return closed_;
	}

private:
	TIndexOffU lim_;  /// limit on number of 32-bit words to dish out in total
	TIndexOffU occ_;  /// number of occupied words
	TIndexOffU *buf_; /// buffer of 32-bit words
	bool closed_;     /// true -> no more room; further allocations will fail
#ifndef NDEBUG
	std::set<TIndexOffU> allocs_; // offsets of allocated entries
#endif
};
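
/*
 * Illustrative sketch (hypothetical values, not part of the original
 * interface): a caller sizes the pool in bytes, reserves a run of
 * words with alloc(), and turns the returned pool offset back into a
 * pointer with get().
 *
 *   RangeCacheMemPool pool(4096);        // 4096 bytes of cache memory
 *   TIndexOffU off = pool.alloc(5);      // room for a length + 4 offsets
 *   if(off != RANGE_CACHE_BAD_ALLOC) {
 *       TIndexOffU *ent = pool.get(off); // ent[0] was cleared to 0
 *       ent[0] = 4;                      // length word
 *       // ent[1]..ent[4] hold resolved reference offsets
 *       // (each starts out as RANGE_NOT_SET)
 *   }
 */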

/**
 * A view onto a range of cached reference positions.
 */
class RangeCacheEntry {
public:
	/**
	 * Create an empty, invalid cache entry.
	 */
	RangeCacheEntry(bool sanity = false) :
		top_(OFF_MASK), jumps_(0), len_(0), ents_(NULL), ebwt_(NULL),
		verbose_(false), sanity_(sanity)
	{ }

	/**
	 * Create a new RangeCacheEntry from the data in the pool at 'ent'.
	 */
	RangeCacheEntry(RangeCacheMemPool& pool, TIndexOffU top,
	                TIndexOffU ent, Ebwt* ebwt, bool sanity = false) :
		verbose_(false), sanity_(sanity)
	{
		init(pool, top, ent, ebwt);
	}

	/**
	 * Initialize a RangeCacheEntry from the data in the pool at 'ent'.
	 */
	void init(RangeCacheMemPool& pool, TIndexOffU top, TIndexOffU ent, Ebwt* ebwt) {
		assert(ebwt != NULL);
		top_ = top;
		ebwt_ = ebwt;
		TIndexOffU *ents = pool.get(ent);
		assert_neq(CACHE_WRAPPER_BIT, ents[0]);
		// Is hi bit set?
		if((ents[0] & CACHE_WRAPPER_BIT) != 0) {
			// If so, the target is a wrapper and the non-hi bits
			// contain the # jumps
			jumps_ = (ents[0] & ~CACHE_WRAPPER_BIT);
			assert_gt(jumps_, 0);
			assert_leq(jumps_, ebwt_->_eh._len);
			// Get the target entry
			TIndexOffU *dest = pool.get(ents[1]);
			// Get the length from the target entry
			len_ = dest[0];
			assert_leq(top_ + len_, ebwt_->_eh._len);
			assert_gt(len_, 0);
			assert_leq(len_, ebwt_->_eh._len);
			// Get the pointer to the entries themselves
			ents_ = dest + 1;
		} else {
			// Not a wrapper, so there are no jumps
			jumps_ = 0;
			// Get the length from the target entry
			len_ = ents[0];
			assert_leq(top_ + len_, ebwt_->_eh._len);
			assert_gt(len_, 0);
			assert_leq(len_, ebwt_->_eh._len);
			// Get the pointer to the entries themselves
			ents_ = ents + 1;
		}
		if (sanity_)
			assert(sanityCheckEnts());
	}

	/**
	 * Initialize a wrapper with given number of jumps and given target
	 * entry index.
	 */
	void init(RangeCacheMemPool& pool, TIndexOffU top, TIndexOffU jumps,
	          TIndexOffU ent, Ebwt* ebwt)
	{
		assert(ebwt != NULL);
		ebwt_ = ebwt;
		top_ = top;
		jumps_ = jumps;
		TIndexOffU *ents = pool.get(ent);
		// Must not be a wrapper
		assert_eq(0, ents[0] & CACHE_WRAPPER_BIT);
		// Get the length from the target entry
		len_ = ents[0];
		assert_gt(len_, 0);
		assert_leq(len_, ebwt_->_eh._len);
		// Get the pointer to the entries themselves
		ents_ = ents + 1;
		assert_leq(top_ + len_, ebwt_->_eh._len);
		if (sanity_)
			assert(sanityCheckEnts());
	}

	TIndexOffU len() const {
		assert(ents_ != NULL);
		assert(ebwt_ != NULL);
		return len_;
	}

	TIndexOffU jumps() const {
		assert(ents_ != NULL);
		assert(ebwt_ != NULL);
		return jumps_;
	}

	/**
	 * Invalidate this entry so that valid() returns false.
	 */
	void reset() {
		ents_ = NULL;
	}

	/**
	 * Return true iff this object represents a valid cache entry.
	 */
	bool valid() const {
		return ents_ != NULL;
	}

	Ebwt *ebwt() {
		return ebwt_;
	}

	/**
	 * Install a result obtained by a client of this cache; be sure to
	 * adjust for how many jumps down the tunnel the cache entry is
	 * situated.
	 */
	void install(TIndexOffU elt, TIndexOffU val) {
		if(ents_ == NULL) {
			// This is not a valid cache entry; do nothing
			return;
		}
		assert(ents_ != NULL);
		assert(ebwt_ != NULL);
		assert_leq(jumps_, val);
		assert_neq(OFF_MASK, val);
		assert_leq(top_ + len_, ebwt_->_eh._len);
		if(elt < len_) {
			val -= jumps_;
			if(verbose_) cout << "Installed reference offset: " << (top_ + elt) << endl;
			ASSERT_ONLY(TIndexOffU sanity = RowChaser::toFlatRefOff(ebwt_, 1, top_ + elt));
			assert_eq(sanity, val);
#ifndef NDEBUG
			for(size_t i = 0; i < len_; i++) {
				if(i == elt) continue;
				assert_neq(val, ents_[i]);
			}
#endif
			ents_[elt] = val;
		} else {
			// ignore install request
			if(verbose_) cout << "Fell off end of cache entry for install: " << (top_ + elt) << endl;
		}
	}

	/**
	 * Get an element from the cache, adjusted for tunnel jumps.
	 */
	inline TIndexOffU get(TIndexOffU elt) const {
		if(ents_ == NULL) {
			// This is not a valid cache entry; do nothing
			return RANGE_NOT_SET;
		}
		assert(ents_ != NULL);
		assert(ebwt_ != NULL);
		assert_leq(top_ + len_, ebwt_->_eh._len);
		if(elt < len_ && ents_[elt] != RANGE_NOT_SET) {
			if(verbose_) cout << "Retrieved result from cache: " << (top_ + elt) << endl;
			TIndexOffU ret = ents_[elt] + jumps_;
			ASSERT_ONLY(TIndexOffU sanity = RowChaser::toFlatRefOff(ebwt_, 1, top_ + elt));
			assert_eq(sanity, ret);
			return ret;
		} else {
			if(verbose_) cout << "Cache entry not set: " << (top_ + elt) << endl;
			return RANGE_NOT_SET;
		}
	}

	/**
	 * Check that len and the ents array both make sense.
	 */
	static bool sanityCheckEnts(TIndexOffU len, TIndexOffU *ents, Ebwt* ebwt) {
		assert_gt(len, 0);
		assert_leq(len, ebwt->_eh._len);
		if(len < 10) {
			for(size_t i = 0; i < len; i++) {
				if(ents[i] == OFF_MASK) continue;
				assert_leq(ents[i], ebwt->_eh._len);
				for(size_t j = i+1; j < len; j++) {
					if(ents[j] == OFF_MASK) continue;
					assert_neq(ents[i], ents[j]);
				}
			}
		} else {
			std::set<TIndexOffU> seen;
			for(size_t i = 0; i < len; i++) {
				if(ents[i] == OFF_MASK) continue;
				assert(seen.find(ents[i]) == seen.end());
				seen.insert(ents[i]);
			}
		}
		return true;
	}

	/**
	 * Check that len_ and the ents_ array both make sense.
	 */
	bool sanityCheckEnts() {
		return RangeCacheEntry::sanityCheckEnts(len_, ents_, ebwt_);
	}

private:

	TIndexOffU top_;   /// top pointer for this range
	TIndexOffU jumps_; /// how many tunnel-jumps it is away from the requester
	TIndexOffU len_;   /// # of entries in cache entry
	TIndexOffU *ents_; /// ptr to entries, which are flat offs within joined ref
	Ebwt *ebwt_;       /// index that alignments are in
	bool verbose_;     /// be talkative?
	bool sanity_;      /// do consistency checks?
};
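
/*
 * Illustrative sketch (hypothetical names and values, not part of the
 * original interface): a wrapper view onto a cache entry whose range
 * lies 3 LF-jumps to the left reports cached offsets shifted by +3 via
 * get(), and install() stores newly resolved offsets shifted by -3, so
 * the underlying entry stays expressed in its own coordinates.
 *
 *   RangeCacheEntry ent;
 *   ent.init(pool, top, 3, targetIdx, ebwt); // wrapper view, jumps = 3
 *   TIndexOffU off = ent.get(0);             // cached value + 3, or RANGE_NOT_SET
 *   if(off == RANGE_NOT_SET) {
 *       ent.install(0, resolvedOff);         // stored as resolvedOff - 3
 *   }
 */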

/**
 * Maps BWT ranges to cached sets of resolved reference offsets, so
 * that a range (or a range reachable from it by LF-mapping) need only
 * be resolved once.
 */
class RangeCache {

	typedef EList<TIndexOffU> TUVec;
	typedef std::map<TIndexOffU, TIndexOffU> TMap;
	typedef std::map<TIndexOffU, TIndexOffU>::iterator TMapItr;

public:
	RangeCache(TIndexOffU lim, Ebwt* ebwt) :
		lim_(lim), map_(), pool_(lim), closed_(false), ebwt_(ebwt), sanity_(true) { }

	/**
	 * Given top and bot offsets, retrieve the canonical cache entry
	 * that best covers that range.  The cache entry may not directly
	 * correspond to the top offset provided; rather, it might be an
	 * entry that lies "at the end of the tunnel" when top and bot are
	 * walked backward.
	 */
	bool lookup(TIndexOffU top, TIndexOffU bot, RangeCacheEntry& ent) {
		if(ebwt_ == NULL || lim_ == 0) return false;
		assert_gt(bot, top);
		ent.reset();
		TMapItr itr = map_.find(top);
		if(itr == map_.end()) {
			// No cache entry for the given 'top' offset
			if(closed_) {
				return false; // failed to get cache entry
			} else {
				if(pool_.closed()) {
					closed_ = true;
					return false; // failed to get cache entry
				}
			}
			// Use the tunnel
			bool ret = tunnel(top, bot, ent);
			return ret;
		} else {
			// There is a cache entry for the given 'top' offset
			TIndexOffU ret = itr->second;
			ent.init(pool_, top, ret, ebwt_);
			return true; // success
		}
	}

	/**
	 * Exhaustively check all entries linked to from map_ to ensure
	 * they're well-formed.
	 */
	bool repOk() {
#ifndef NDEBUG
		for(TMapItr itr = map_.begin(); itr != map_.end(); itr++) {
			TIndexOffU top = itr->first;
			TIndexOffU idx = itr->second;
			TIndexOffU jumps = 0;
			assert_leq(top, ebwt_->_eh._len);
			TIndexOffU *ents = pool_.get(idx);
			if((ents[0] & CACHE_WRAPPER_BIT) != 0) {
				jumps = ents[0] & ~CACHE_WRAPPER_BIT;
				assert_leq(jumps, ebwt_->_eh._len);
				idx = ents[1];
				ents = pool_.get(idx);
			}
			TIndexOffU len = ents[0];
			assert_leq(top + len, ebwt_->_eh._len);
			if (sanity_)
				RangeCacheEntry::sanityCheckEnts(len, ents + 1, ebwt_);
		}
#endif
		return true;
	}

protected:

	/**
	 * Tunnel through to the first range that (1) includes all the same
	 * suffixes (though longer) as the given range, and (2) has a cache
	 * entry for it.
	 */
	bool tunnel(TIndexOffU top, TIndexOffU bot, RangeCacheEntry& ent) {
		assert_gt(bot, top);
		TUVec tops;
		const TIndexOffU spread = bot - top;
		SideLocus tloc, bloc;
		SideLocus::initFromTopBot(top, bot, ebwt_->_eh, ebwt_->_ebwt, tloc, bloc);
		TIndexOffU newtop = top, newbot = bot;
		TIndexOffU jumps = 0;
		// Walk left through the tunnel
		while(true) {
			if(ebwt_->rowL(tloc) != ebwt_->rowL(bloc)) {
				// Different characters at top and bot positions of
				// BWT; this means that the calls to mapLF below are
				// guaranteed to yield rows in two different character-
				// sections of the BWT.
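				// The LF-mapped range therefore could not contain all
				// of the same suffixes as [top, bot); stop tunneling
				// and cache an entry for the range reached so far.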
				break;
			}
			// Advance top and bot
			newtop = ebwt_->mapLF(tloc);
			newbot = ebwt_->mapLF(bloc);
			assert_geq(newbot, newtop);
			assert_leq(newbot - newtop, spread);
			// If the new spread is the same as the old spread, we can
			// be confident that the new range includes all of the same
			// suffixes as the last range (though longer by 1 char)
			if((newbot - newtop) == spread) {
				// Check if newtop is already cached
				TMapItr itr = map_.find(newtop);
				jumps++;
				if(itr != map_.end()) {
					// This range, which is further to the left in the
					// same tunnel as the query range, has a cache
					// entry already, so use that
					TIndexOffU idx = itr->second;
					TIndexOffU *ents = pool_.get(idx);
					if((ents[0] & CACHE_WRAPPER_BIT) != 0) {
						// The cache entry we found was a wrapper; make
						// a new wrapper that points to that wrapper's
						// target, with the appropriate number of jumps
						jumps += (ents[0] & ~CACHE_WRAPPER_BIT);
						idx = ents[1];
					}
					// Allocate a new wrapper
					TIndexOffU newentIdx = pool_.alloc(2);
					if(newentIdx != RANGE_CACHE_BAD_ALLOC) {
						// We successfully made a new wrapper entry;
						// now populate it and install it in map_
						TIndexOffU *newent = pool_.get(newentIdx); // get ptr to it
						assert_eq(0, newent[0]);
						newent[0] = CACHE_WRAPPER_BIT | jumps; // set jumps
						newent[1] = idx; // set target
						assert(map_.find(top) == map_.end());
						map_[top] = newentIdx;
						if(sanity_) assert(repOk());
					}
					// Initialize the entry
					ent.init(pool_, top, jumps, idx, ebwt_);
					return true;
				}
				// Save this range
				tops.push_back(newtop);
				SideLocus::initFromTopBot(newtop, newbot, ebwt_->_eh, ebwt_->_ebwt, tloc, bloc);
			} else {
				// Not all the suffixes were preserved, so we can't
				// link the source range's cached result to this
				// range's cached results
				break;
			}
			assert_eq(jumps, tops.size());
		}
		assert_eq(jumps, tops.size());
		// Try to create a new cache entry for the leftmost range in
		// the tunnel (which might be the query range)
		TIndexOffU newentIdx = pool_.alloc(spread + 1);
		if(newentIdx != RANGE_CACHE_BAD_ALLOC) {
			// Successfully allocated new range cache entry; install it
			TIndexOffU *newent = pool_.get(newentIdx);
			assert_eq(0, newent[0]);
			// Store cache-range length in first word
			newent[0] = spread;
			assert_lt(newent[0], CACHE_WRAPPER_BIT);
			assert_eq(spread, newent[0]);
			TIndexOffU entTop = top;
			TIndexOffU jumps = 0;
			if(tops.size() > 0) {
				entTop = tops.back();
				jumps = (TIndexOffU)tops.size();
			}
			// Cache the entry for the end of the tunnel
			assert(map_.find(entTop) == map_.end());
			map_[entTop] = newentIdx;
			if(sanity_) assert(repOk());
			ent.init(pool_, entTop, jumps, newentIdx, ebwt_);
			assert_eq(spread, newent[0]);
			if(jumps > 0) {
				assert_neq(entTop, top);
				// Cache a wrapper entry for the query range (if possible)
				TIndexOffU wrapentIdx = pool_.alloc(2);
				if(wrapentIdx != RANGE_CACHE_BAD_ALLOC) {
					TIndexOffU *wrapent = pool_.get(wrapentIdx);
					assert_eq(0, wrapent[0]);
					wrapent[0] = CACHE_WRAPPER_BIT | jumps;
					wrapent[1] = newentIdx;
					assert(map_.find(top) == map_.end());
					map_[top] = wrapentIdx;
					if(sanity_) assert(repOk());
				}
			}
			return true;
		} else {
			// Could not allocate new range cache entry
			return false;
		}
	}

	TIndexOffU lim_;         /// total number of key/val bytes to keep in cache
	TMap map_;               /// maps range 'top' offsets to indexes into pool_
	RangeCacheMemPool pool_; /// memory pool for cache entries and wrappers
	bool closed_;            /// out of space; no new entries
	Ebwt* ebwt_;             /// index that alignments are in
	bool sanity_;            /// do consistency checks?
};
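
/*
 * Illustrative sketch (hypothetical names, not part of the original
 * interface): a client asks the cache for an entry covering a BWT
 * range before resolving rows itself, reads back any offsets already
 * cached, and installs offsets it resolves the hard way.
 *
 *   RangeCache cache(cacheBytes, ebwt);
 *   RangeCacheEntry ent;
 *   if(cache.lookup(top, bot, ent) && ent.valid()) {
 *       TIndexOffU off = ent.get(row - top);  // RANGE_NOT_SET if unknown
 *       if(off == RANGE_NOT_SET) {
 *           off = resolveViaRowChasing(row);  // hypothetical resolver
 *           ent.install(row - top, off);      // share with later queries
 *       }
 *   }
 */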

#endif /* RANGE_CACHE_H_ */