1 // Author: Derek Barnett 2 3 #ifndef PULSE2BASECACHE_H 4 #define PULSE2BASECACHE_H 5 6 #include <cassert> 7 #include <cctype> 8 #include <cstddef> 9 #include <string> 10 11 #include <boost/dynamic_bitset.hpp> 12 13 #include "pbbam/Config.h" 14 15 namespace PacBio { 16 namespace BAM { 17 namespace internal { 18 19 class Pulse2BaseCache 20 { 21 public: 22 /// \brief Creates a Pulse2BaseCache from pulseCall data ('pc' tag) 23 /// 24 /// Computes & stores cache of basecalled vs. squashed pulse positions for 25 /// later masking of pulse data. 26 /// 27 /// \param pulseCalls[in] string contents of 'pc' tag 28 /// Pulse2BaseCache(const std::string & pulseCalls)29 Pulse2BaseCache(const std::string& pulseCalls) : data_(pulseCalls.size()) 30 { 31 // basecalled pulse -> data[i] == 1 32 // squashed pulse -> data[i] == 0 33 // 34 const auto numPulses = pulseCalls.size(); 35 for (size_t i = 0; i < numPulses; ++i) 36 data_[i] = std::isupper(pulseCalls.at(i)); 37 } 38 39 Pulse2BaseCache() = delete; 40 Pulse2BaseCache(const Pulse2BaseCache&) = default; 41 Pulse2BaseCache(Pulse2BaseCache&&) = default; 42 Pulse2BaseCache& operator=(const Pulse2BaseCache&) = default; 43 Pulse2BaseCache& operator=(Pulse2BaseCache&&) = default; 44 ~Pulse2BaseCache() = default; 45 46 public: 47 /// 48 /// \brief FindFirst 49 /// \return 50 /// FindFirst()51 size_t FindFirst() const { return data_.find_first(); } 52 53 /// 54 /// \brief FindNext 55 /// \param from 56 /// \return 57 /// FindNext(size_t from)58 size_t FindNext(size_t from) const { return data_.find_next(from); } 59 60 /// 61 /// \brief IsBasecallAt 62 /// \param pos 63 /// \return 64 /// IsBasecallAt(const size_t pos)65 bool IsBasecallAt(const size_t pos) const { return data_[pos]; } 66 67 /// \returns the total number of pulses (basecalled & squashed) 68 /// NumPulses()69 size_t NumPulses() const { return data_.size(); } 70 71 /// \returns the total number of basecalled pulses 72 /// NumBases()73 size_t NumBases() const { return data_.count(); } 74 75 /// \brief Removes squashed pulse positions from input data. 76 /// 77 /// \param[in] Contents of any per-pulse tag. 78 /// \returns Input \p pulseData less all squashed pulses 79 /// 80 template <typename T> RemoveSquashedPulses(const T & pulseData)81 T RemoveSquashedPulses(const T& pulseData) const 82 { 83 const auto numPulses = pulseData.size(); 84 assert(numPulses == data_.size()); 85 86 // The reserve() below overshoots the required space, but numPulses is cheap 87 // to compute, and by definition will be sufficient to hold the result. Thus 88 // we only ever need to do one allocation. 89 // 90 T result; 91 result.reserve(numPulses); 92 93 // Only include data at positions that match our cached pulse data. 94 // 95 size_t inputIndex = 0; 96 for (size_t i = 0; i < numPulses; ++i) { 97 if (data_[i]) result.push_back(pulseData.at(inputIndex)); 98 ++inputIndex; 99 } 100 return result; 101 } 102 103 private: 104 boost::dynamic_bitset<> data_; 105 }; 106 107 } // namespace internal 108 } // namespace BAM 109 } // namespace PacBio 110 111 #endif // PULSE2BASECACHE_H 112