1 // File Description
2 /// \file CompositeBamReader.h
3 /// \brief Defines the composite BAM readers, for working with multiple input
4 ///       files.
5 //
6 // Author: Derek Barnett
7 
8 #ifndef COMPOSITEBAMREADER_H
9 #define COMPOSITEBAMREADER_H
10 
#include <cstdint>
#include <deque>
#include <functional>
#include <memory>
#include <string>
#include <vector>

#include "pbbam/BaiIndexedBamReader.h"
#include "pbbam/BamFile.h"
#include "pbbam/BamHeader.h"
#include "pbbam/BamReader.h"
#include "pbbam/BamRecord.h"
#include "pbbam/Config.h"
#include "pbbam/DataSet.h"
#include "pbbam/GenomicInterval.h"
#include "pbbam/PbiIndexedBamReader.h"
25 
26 namespace PacBio {
27 namespace BAM {
28 
29 namespace internal {
30 
31 /// \internal
32 /// \brief The CompositeMergeItem class provides a helper struct for composite
33 ///        readers, containing a single-file reader and its "next" record.
34 ///
35 struct CompositeMergeItem
36 {
37 public:
38     std::unique_ptr<BamReader> reader;
39     BamRecord record;
40 
41 public:
42     CompositeMergeItem(std::unique_ptr<BamReader> rdr);
43     CompositeMergeItem(std::unique_ptr<BamReader> rdr, BamRecord rec);
44     CompositeMergeItem(CompositeMergeItem&&) = default;
45     CompositeMergeItem& operator=(CompositeMergeItem&&) = default;
46     ~CompositeMergeItem() = default;
47 };
48 
49 /// \internal
50 /// \brief The CompositeMergeItemSorter class provides a helper function object
51 ///        for ordering composite reader results.
52 ///
53 /// Essentially just exracts a BamRecord from its parent CompositeMergeItem for
54 /// further checks.
55 ///
56 template <typename CompareType>
57 struct CompositeMergeItemSorter
58     : public std::function<bool(const CompositeMergeItem&, const CompositeMergeItem&)>
59 {
60     bool operator()(const CompositeMergeItem& lhs, const CompositeMergeItem& rhs);
61 };
62 
63 }  // namespace internal
64 
65 /// \brief The GenomicIntervalCompositeBamReader class provides read access to
66 ///        multipe %BAM files, limiting results to a genomic region.
67 ///
68 /// Requires a ".bai" file for each input %BAM file.
69 ///
70 /// Results will be returned in order of genomic coordinate (first by reference
71 /// ID, then by position).
72 ///
73 class PBBAM_EXPORT GenomicIntervalCompositeBamReader
74 {
75 public:
76     /// \name Contstructors & Related Methods
77     /// \{
78 
79     GenomicIntervalCompositeBamReader(const GenomicInterval& interval,
80                                       const std::vector<BamFile>& bamFiles);
81     GenomicIntervalCompositeBamReader(const GenomicInterval& interval, const DataSet& dataset);
82 
83     /// \}
84 
85 public:
86     /// \name Data Access
87     /// \{
88 
89     /// Fetches next BAM record in the interval specified, storing in \p record
90     ///
91     /// \param[out] record
92     /// \returns true on success, false if no more data available.
93     ///
94     bool GetNext(BamRecord& record);
95 
96     /// Sets a new genomic interval of interest.
97     ///
98     /// \returns reference to this reader
99     ///
100     GenomicIntervalCompositeBamReader& Interval(const GenomicInterval& interval);
101 
102     /// \returns the current specified interval
103     ///
104     const GenomicInterval& Interval() const;
105 
106     /// \}
107 
108 private:
109     void UpdateSort();
110 
111 private:
112     GenomicInterval interval_;
113     std::deque<internal::CompositeMergeItem> mergeItems_;
114     std::vector<std::string> filenames_;
115 };
116 
117 /// \brief Provides read access to multipe %BAM files, limiting results to those
118 ///        passing a PbiFilter.
119 ///
120 /// Requires a ".pbi" file for each input %BAM file.
121 ///
122 /// \note The template parameter OrderByType is not fully implemented at this
123 ///       time. Use of comparison functor (e.g. Compare::Zmw) for this will
124 ///       currently result in the proper "next" value <b> at each iteration
125 ///       step, independently, but not over the full data set. </b> If all
126 ///       files' "order-by" data values are accessible in increasing order
127 ///       within each file, then the expected ordering will be observed,
128 ///       However, if these data are not sorted within a file, the final results
129 ///       will appear unordered. \n
130 ///       \n
131 ///           Example:\n
132 ///           file 1: { 1, 5, 2, 6 } \n
133 ///           file 2: { 3, 8, 4, 7 } \n
134 ///           results: { 1, 3, 5, 2, 6, 8, 4, 7 } \n
135 ///       \n
136 ///       This a known issue and will be addressed in a future update. But in
137 ///       the meantime, use of Compare::None as the OrderByType is recommended,
138 ///       to explicitly indicate that no particular ordering is expected.
139 ///
140 template <typename OrderByType>
141 class PBBAM_EXPORT PbiFilterCompositeBamReader
142 {
143 public:
144     using value_type = internal::CompositeMergeItem;
145     using merge_sorter_type = internal::CompositeMergeItemSorter<OrderByType>;
146     using container_type = std::deque<value_type>;
147     using iterator = typename container_type::iterator;
148     using const_iterator = typename container_type::const_iterator;
149 
150 public:
151     /// \name Contstructors & Related Methods
152     /// \{
153 
154     PbiFilterCompositeBamReader(const PbiFilter& filter, const std::vector<BamFile>& bamFiles);
155     PbiFilterCompositeBamReader(const PbiFilter& filter, const DataSet& dataset);
156 
157     /// \}
158 
159 public:
160     /// \name Data Access
161     /// \{
162 
163     /// Fetches next BAM record in the interval specified.
164     ///
165     /// \returns true on success, false if no more data available.
166     ///
167     bool GetNext(BamRecord& record);
168 
169     /// Sets a new PBI filter
170     ///
171     /// \returns reference to this reader
172     ///
173     PbiFilterCompositeBamReader& Filter(const PbiFilter& filter);
174 
175     uint32_t NumReads() const;
176 
177     /// \}
178 
179 private:
180     void UpdateSort();
181 
182 private:
183     container_type mergeQueue_;
184     std::vector<std::string> filenames_;
185     uint32_t numReads_;
186 };
187 
188 /// \brief The SequentialCompositeBamReader class provides read access to
189 ///        multiple %BAM files, reading through the entire contents of each
190 ///        file.
191 ///
192 /// Input files will be accessed in the order provided to the constructor. Each
193 /// file's contents will be exhausted before moving on to the next one (as
194 /// opposed to a "round-robin" scheme).
195 ///
196 class PBBAM_EXPORT SequentialCompositeBamReader
197 {
198 public:
199     /// \name Contstructors & Related Methods
200     /// \{
201 
202     SequentialCompositeBamReader(std::vector<BamFile> bamFiles);
203     SequentialCompositeBamReader(const DataSet& dataset);
204 
205     /// \}
206 
207 public:
208     /// \name Data Access
209     /// \{
210 
211     /// Fetches next BAM record from the .
212     ///
213     /// \returns true on success, false if no more data available.
214     ///
215     bool GetNext(BamRecord& record);
216 
217     /// \}
218 
219 private:
220     std::deque<std::unique_ptr<BamReader> > readers_;
221 };
222 
223 }  // namespace BAM
224 }  // namespace PacBio
225 
226 #include "pbbam/internal/CompositeBamReader.inl"
227 
228 #endif  // COMPOSITEBAMREADER_H
229