1 // Author: Derek Barnett
2 
3 #ifndef PULSE2BASECACHE_H
4 #define PULSE2BASECACHE_H
5 
6 #include <cassert>
7 #include <cctype>
8 #include <cstddef>
9 #include <string>
10 
11 #include <boost/dynamic_bitset.hpp>
12 
13 #include "pbbam/Config.h"
14 
15 namespace PacBio {
16 namespace BAM {
17 namespace internal {
18 
19 class Pulse2BaseCache
20 {
21 public:
22     /// \brief Creates a Pulse2BaseCache from pulseCall data ('pc' tag)
23     ///
24     /// Computes & stores cache of basecalled vs. squashed pulse positions for
25     /// later masking of pulse data.
26     ///
27     /// \param pulseCalls[in]   string contents of 'pc' tag
28     ///
Pulse2BaseCache(const std::string & pulseCalls)29     Pulse2BaseCache(const std::string& pulseCalls) : data_(pulseCalls.size())
30     {
31         // basecalled pulse -> data[i] == 1
32         // squashed pulse   -> data[i] == 0
33         //
34         const auto numPulses = pulseCalls.size();
35         for (size_t i = 0; i < numPulses; ++i)
36             data_[i] = std::isupper(pulseCalls.at(i));
37     }
38 
39     Pulse2BaseCache() = delete;
40     Pulse2BaseCache(const Pulse2BaseCache&) = default;
41     Pulse2BaseCache(Pulse2BaseCache&&) = default;
42     Pulse2BaseCache& operator=(const Pulse2BaseCache&) = default;
43     Pulse2BaseCache& operator=(Pulse2BaseCache&&) = default;
44     ~Pulse2BaseCache() = default;
45 
46 public:
47     ///
48     /// \brief FindFirst
49     /// \return
50     ///
FindFirst()51     size_t FindFirst() const { return data_.find_first(); }
52 
53     ///
54     /// \brief FindNext
55     /// \param from
56     /// \return
57     ///
FindNext(size_t from)58     size_t FindNext(size_t from) const { return data_.find_next(from); }
59 
60     ///
61     /// \brief IsBasecallAt
62     /// \param pos
63     /// \return
64     ///
IsBasecallAt(const size_t pos)65     bool IsBasecallAt(const size_t pos) const { return data_[pos]; }
66 
67     /// \returns the total number of pulses (basecalled & squashed)
68     ///
NumPulses()69     size_t NumPulses() const { return data_.size(); }
70 
71     /// \returns the total number of basecalled pulses
72     ///
NumBases()73     size_t NumBases() const { return data_.count(); }
74 
75     /// \brief Removes squashed pulse positions from input data.
76     ///
77     /// \param[in]  Contents of any per-pulse tag.
78     /// \returns    Input \p pulseData less all squashed pulses
79     ///
80     template <typename T>
RemoveSquashedPulses(const T & pulseData)81     T RemoveSquashedPulses(const T& pulseData) const
82     {
83         const auto numPulses = pulseData.size();
84         assert(numPulses == data_.size());
85 
86         // The reserve() below overshoots the required space, but numPulses is cheap
87         // to compute, and by definition will be sufficient to hold the result. Thus
88         // we only ever need to do one allocation.
89         //
90         T result;
91         result.reserve(numPulses);
92 
93         // Only include data at positions that match our cached pulse data.
94         //
95         size_t inputIndex = 0;
96         for (size_t i = 0; i < numPulses; ++i) {
97             if (data_[i]) result.push_back(pulseData.at(inputIndex));
98             ++inputIndex;
99         }
100         return result;
101     }
102 
103 private:
104     boost::dynamic_bitset<> data_;
105 };
106 
107 }  // namespace internal
108 }  // namespace BAM
109 }  // namespace PacBio
110 
111 #endif  // PULSE2BASECACHE_H
112