1 // File Description
2 /// \file BamHeader.h
3 /// \brief Defines the BamHeader class.
4 //
5 // Author: Derek Barnett
6 
7 #ifndef BAMHEADER_H
8 #define BAMHEADER_H
9 
10 #include <cstddef>
11 #include <cstdint>
12 #include <memory>
13 #include <stdexcept>
14 #include <string>
15 #include <vector>
16 
17 #include "pbbam/Config.h"
18 #include "pbbam/ProgramInfo.h"
19 #include "pbbam/ReadGroupInfo.h"
20 #include "pbbam/SequenceInfo.h"
21 
22 namespace PacBio {
23 namespace BAM {
24 
// Forward declaration of the implementation type that BamHeader refers to
// through its shared-pointer member; the definition is not part of this header.
namespace internal {

class BamHeaderPrivate;

}  // namespace internal
28 
29 /// \brief The BamHeader class represents the header section of the %BAM file.
30 ///
31 /// It provides metadata about the file including file version, reference
32 /// sequences, read groups, comments, etc.
33 ///
34 /// A BamHeader may be fetched from a BamFile to view an existing file's
35 /// metadata. Or one may be created/edited for use with writing to a new file
36 /// (via BamWriter).
37 ///
38 /// \note A particular BamHeader is likely to be re-used in lots of places
39 ///       throughout the library, for read-only purposes. For this reason, even
40 ///       though a BamHeader may be returned by value, it is essentially a thin
41 ///       wrapper for a shared-pointer to the actual data. This means, though,
42 ///       that if you need to edit an existing BamHeader for use with a
43 ///       BamWriter, please consider using BamHeader::DeepCopy. Otherwise any
44 ///       modifications will affect all BamHeaders that are sharing its
45 ///       underlying data.
46 ///
47 class PBBAM_EXPORT BamHeader
48 {
49 public:
50     /// \name Constructors & Related Methods
51     /// \{
52 
53     ///
54     /// \brief Creates a BamHeader from SAM-formatted text
55     /// \param samHeaderText
56     ///
57     BamHeader(const std::string& samHeaderText);
58 
59     BamHeader();
60     BamHeader(const BamHeader&) = default;
61     BamHeader(BamHeader&&) = default;
62     BamHeader& operator=(const BamHeader&) = default;
63     BamHeader& operator=(BamHeader&&) = default;
64     ~BamHeader() = default;
65 
66     /// \brief Detaches underlying data from the shared-pointer, returning a
67     ///        independent copy of the header contents.
68     ///
69     /// This ensures that any modifications to the newly returned BamHeader do
70     /// not affect other BamHeader objects that were sharing its underlying data.
71     ///
72     BamHeader DeepCopy() const;
73 
74     /// \}
75 
76 public:
77     /// \name Operators
78     /// \{
79 
80     /// \brief Merges another header with this one.
81     ///
82     /// Headers must be compatible for merging. This means that their Version,
83     /// SortOrder, PacBioBamVersion (and in the case of aligned BAM data,
84     /// Sequences) must all match. If not, an exception will be thrown.
85     ///
86     /// \param[in] other  header to merge with this one
87     /// \returns reference to this header
88     ///
89     /// \throws std::runtime_error if the headers are not compatible
90     ///
91     BamHeader& operator+=(const BamHeader& other);
92 
93     /// \brief Creates a new, merged header.
94     ///
95     /// Headers must be compatible for merging. This means that their Version,
96     /// SortOrder, PacBioBamVersion (and in the case of aligned BAM data,
97     /// Sequences) must all match. If not, an exception will be thrown.
98     ///
99     /// Both original headers (this header and \p other) will not be modified.
100     ///
101     /// \param[in] other  header to merge with this one
102     /// \returns merged header
103     ///
104     /// \throws std::runtime_error if the headers are not compatible
105     ///
106     BamHeader operator+(const BamHeader& other) const;
107 
108     /// \}
109 
110 public:
111     /// \name General Attributes
112     /// \{
113 
114     /// \returns the %PacBio %BAM version number (\@HD:pb)
115     ///
116     /// \note This is different from the SAM/BAM version number
117     /// \sa BamHeader::Version.
118     ///
119     std::string PacBioBamVersion() const;
120 
121     /// \returns the sort order used
122     ///
123     /// Valid values: "unknown", "unsorted", "queryname", or "coordinate"
124     ///
125     std::string SortOrder() const;
126 
127     /// \returns the SAM/BAM version number (\@HD:VN)
128     ///
129     /// \note This is different from the %PacBio %BAM version number
130     /// \sa BamHeader::PacBioBamVersion
131     ///
132     std::string Version() const;
133 
134     /// \}
135 
136 public:
137     /// \name Read Groups
138     /// \{
139 
140     /// \returns true if the header contains a read group with \p id (\@RG:ID)
141     bool HasReadGroup(const std::string& id) const;
142 
143     /// \returns a ReadGroupInfo object representing the read group matching
144     ///          \p id (\@RG:ID)
145     /// \throws std::runtime_error if \p id is unknown
146     ///
147     ReadGroupInfo ReadGroup(const std::string& id) const;
148 
149     /// \returns vector of read group IDs listed in this header
150     std::vector<std::string> ReadGroupIds() const;
151 
152     /// \returns vector of ReadGroupInfo objects, representing all read groups
153     ///          listed in this header
154     ///
155     std::vector<ReadGroupInfo> ReadGroups() const;
156 
157     /// \}
158 
159 public:
160     /// \name Sequences
161     /// \{
162 
163     /// \returns true if header contains a sequence with \p name (\@SQ:SN)
164     bool HasSequence(const std::string& name) const;
165 
166     /// \returns number of sequences (\@SQ entries) stored in this header
167     size_t NumSequences() const;
168 
169     /// \returns numeric ID for sequence matching \p name (\@SQ:SN)
170     ///
171     /// This is the numeric ID used elsewhere throughout the API.
172     ///
173     /// \throws std::runtime_error if \p name is unknown
174     /// \sa BamReader::ReferenceId, PbiReferenceIdFilter,
175     ///     PbiRawMappedData::tId_
176     ///
177     int32_t SequenceId(const std::string& name) const;
178 
179     /// \returns the length of the sequence (\@SQ:LN, e.g. chromosome length) at
180     ///          index \p id
181     ///
182     /// \sa SequenceInfo::Length, BamHeader::SequenceId
183     ///
184     std::string SequenceLength(const int32_t id) const;
185 
186     /// \returns the name of the sequence (\@SQ:SN) at index \p id
187     ///
188     /// \sa SequenceInfo::Name, BamHeader::SequenceId
189     ///
190     std::string SequenceName(const int32_t id) const;
191 
192     /// \returns vector of sequence names (\@SQ:SN) stored in this header
193     ///
194     /// Position in the vector is equivalent to SequenceId.
195     ///
196     std::vector<std::string> SequenceNames() const;
197 
198     /// \returns SequenceInfo object at index \p id
199     ///
200     /// \throws std::out_of_range if \p is an invalid or unknown index
201     /// \sa BamHeader::SequenceId
202     ///
203     SequenceInfo Sequence(const int32_t id) const;
204 
205     /// \returns SequenceInfo for the sequence matching \p name
206     SequenceInfo Sequence(const std::string& name) const;
207 
208     /// \returns vector of SequenceInfo objects representing the sequences
209     ///          (\@SQ entries) stored in this header
210     ///
211     std::vector<SequenceInfo> Sequences() const;
212 
213     /// \}
214 
215 public:
216     /// \name Programs
217     /// \{
218 
219     /// \returns true if this header contains a program entry with ID (\@PG:ID)
220     ///          matching \p id
221     ///
222     bool HasProgram(const std::string& id) const;
223 
224     /// \returns ProgramInfo object for the program entry matching \p id
225     /// \throws std::runtime_error if \p id is unknown
226     ///
227     ProgramInfo Program(const std::string& id) const;
228 
229     /// \returns vector of program IDs (\@PG:ID)
230     std::vector<std::string> ProgramIds() const;
231 
232     /// \returns vector of ProgramInfo objects representing program entries
233     ///          (\@PG) stored in this heder
234     ///
235     std::vector<ProgramInfo> Programs() const;
236 
237     /// \}
238 
239 public:
240     /// \name Comments
241     /// \{
242 
243     /// \returns vector of comment (\@CO) strings
244     std::vector<std::string> Comments() const;
245 
246     /// \}
247 
248 public:
249     /// \name Conversion Methods
250     /// \{
251 
252     /// \returns SAM-header-formatted string representing this header's data
253     std::string ToSam() const;
254 
255     /// \}
256 
257 public:
258     /// \name General Attributes
259     /// \{
260 
261     /// \brief Sets this header's PacBioBAM version number (\@HD:pb).
262     ///
263     /// \returns reference to this object
264     /// \throws std::runtime_error if version number cannot be parsed or
265     ///         is less than the minimum version allowed.
266     ///
267     BamHeader& PacBioBamVersion(const std::string& version);
268 
269     /// \brief Sets this header's sort order label (\@HD:SO).
270     ///
271     /// Valid values: "unknown", "unsorted", "queryname", or "coordinate"
272     ///
273     /// \returns reference to this object
274     ///
275     BamHeader& SortOrder(std::string order);
276 
277     /// \brief Sets this header's SAM/BAM version number (\@HD:VN).
278     ///
279     /// \returns reference to this object
280     ///
281     BamHeader& Version(std::string version);
282 
283     /// \}
284 
285 public:
286     /// \name Read Groups
287     /// \{
288 
289     /// \brief Appends a read group entry (\@RG) to this header.
290     ///
291     /// \returns reference to this object
292     ///
293     BamHeader& AddReadGroup(ReadGroupInfo readGroup);
294 
295     /// \brief Removes all read group entries from this header.
296     ///
297     /// \returns reference to this object
298     ///
299     BamHeader& ClearReadGroups();
300 
301     /// \brief Replaces this header's list of read group entries with those in
302     ///        \p readGroups.
303     ///
304     /// \returns reference to this object
305     ///
306     BamHeader& ReadGroups(std::vector<ReadGroupInfo> readGroups);
307 
308     /// \}
309 
310 public:
311     /// \name Sequences
312     /// \{
313 
314     /// \brief Appends a sequence entry (\@SQ) to this header.
315     ///
316     /// \returns reference to this object
317     ///
318     BamHeader& AddSequence(SequenceInfo sequence);
319 
320     /// \brief Removes all sequence entries from this header.
321     ///
322     /// \returns reference to this object
323     ///
324     BamHeader& ClearSequences();
325 
326     /// \brief Replaces this header's list of sequence entries with those in
327     ///       \p sequences.
328     ///
329     /// \returns reference to this object
330     ///
331     BamHeader& Sequences(std::vector<SequenceInfo> sequences);
332 
333     /// \}
334 
335 public:
336     /// \name Programs
337     /// \{
338 
339     /// \brief Appends a program entry (\@PG) to this header.
340     ///
341     /// \returns reference to this object
342     ///
343     BamHeader& AddProgram(ProgramInfo pg);
344 
345     /// \brief Removes all program entries from this header.
346     ///
347     /// \returns reference to this object
348     ///
349     BamHeader& ClearPrograms();
350 
351     /// \brief Replaces this header's list of program entries with those in
352     ///        \p programs.
353     ///
354     /// \returns reference to this object
355     ///
356     BamHeader& Programs(std::vector<ProgramInfo> programs);
357 
358     /// \}
359 
360 public:
361     /// \name Comments
362     /// \{
363 
364     /// \brief Appends a comment (\@CO) to this header.
365     ///
366     /// \returns reference to this object
367     ///
368     BamHeader& AddComment(std::string comment);
369 
370     /// \brief Removes all comments from this header.
371     ///
372     /// \returns reference to this object
373     ///
374     BamHeader& ClearComments();
375 
376     /// \brief Replaces this header's list of comments with those in \p comments.
377     ///
378     /// \returns reference to this object
379     ///
380     BamHeader& Comments(std::vector<std::string> comments);
381 
382     /// \}
383 
384 private:
385     std::shared_ptr<internal::BamHeaderPrivate> d_;
386 };
387 
388 }  // namespace BAM
389 }  // namespace PacBio
390 
391 #include "pbbam/internal/BamHeader.inl"
392 
393 #endif  // BAMHEADER_H
394