1 // ==========================================================================
2 //                 SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 //     * Redistributions of source code must retain the above copyright
11 //       notice, this list of conditions and the following disclaimer.
12 //     * Redistributions in binary form must reproduce the above copyright
13 //       notice, this list of conditions and the following disclaimer in the
14 //       documentation and/or other materials provided with the distribution.
15 //     * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 //       its contributors may be used to endorse or promote products derived
17 //       from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: Manuel Holtgrewe <manuel.holtgrewe@fu-berlin.de>
33 // ==========================================================================
34 // Code for reading Bam.
35 // ==========================================================================
36 
37 #ifndef INCLUDE_SEQAN_BAM_IO_READ_BAM_H_
38 #define INCLUDE_SEQAN_BAM_IO_READ_BAM_H_
39 
40 namespace seqan {
41 
42 // ============================================================================
43 // Tags, Classes, Enums
44 // ============================================================================
45 
46 /*!
47  * @tag FileFormats#Bam
48  * @brief Identify the BAM format.
49  *
50  * @tag FileFormats#Sam
51  * @brief Identify the SAM format.
52  */
53 
54 struct Bam_;
55 typedef Tag<Bam_> Bam;
56 
57 
58 template <typename T>
59 struct FileExtensions<Bam, T>
60 {
61     static char const * VALUE[1];    // default is one extension
62 };
63 
64 template <typename T>
65 char const * FileExtensions<Bam, T>::VALUE[1] =
66 {
67     ".bam"     // default output extension
68 };
69 
70 
71 template <typename T>
72 struct MagicHeader<Bam, T>
73 {
74     static unsigned char const VALUE[4];
75 };
76 
77 template <typename T>
78 unsigned char const MagicHeader<Bam, T>::VALUE[4] = { 'B', 'A', 'M', '\1' };  // BAM's magic header
79 
80 // ============================================================================
81 // Metafunctions
82 // ============================================================================
83 
84 // ============================================================================
85 // Functions
86 // ============================================================================
87 
88 // ----------------------------------------------------------------------------
89 // Function readHeader()                                              BamHeader
90 // ----------------------------------------------------------------------------
91 
92 template <typename TForwardIter, typename TNameStore, typename TNameStoreCache, typename TStorageSpec>
93 inline void
94 readHeader(BamHeader & header,
95            BamIOContext<TNameStore, TNameStoreCache, TStorageSpec> & context,
96            TForwardIter & iter,
97            Bam const & /*tag*/)
98 {
99     clear(header);
100     // Read BAM magic string.
101     String<char, Array<4> > magic;
102     read(magic, iter, 4);
103     if (magic != "BAM\1")
104         SEQAN_THROW(ParseError("Not in BAM format."));
105 
106     // Read header text, including null padding.
107     int32_t lText;
108     readRawPod(lText, iter);
109 
110     CharString samHeader;
111     write(samHeader, iter, lText);
112 
113     // Truncate to first position of '\0'.
114     cropAfterFirst(samHeader, EqualsChar<'\0'>());
115 
116     // Parse out header records.
117     BamHeaderRecord headerRecord;
118     Iterator<CharString, Rooted>::Type it = begin(samHeader);
119     while (!atEnd(it))
120     {
121         clear(headerRecord);
122         readRecord(headerRecord, context, it, Sam());
123         appendValue(header, headerRecord);
124     }
125 
126     // Read # reference sequences.
127     int32_t nRef;
128     readRawPod(nRef, iter);
129     CharString name;
130 
131     clear(context.translateFile2GlobalRefId);
132     resize(context.translateFile2GlobalRefId, nRef, -1);
133 
134     for (int32_t i = 0; i < nRef; ++i)
135     {
136         // Read length of the reference name.
137         int32_t nName;
138         readRawPod(nName, iter);
139         clear(name);
140         write(name, iter, nName);
141         resize(name, nName - 1);
142         // Read length of the reference sequence.
143         int32_t lRef;
144         readRawPod(lRef, iter);
145 
146         // Add entry to name store and sequenceInfos if necessary.
147         // Compute translation from local ids (used in the BAM file) to corresponding ids in the name store
148         size_t globalRefId = nameToId(contigNamesCache(context), name);
149         context.translateFile2GlobalRefId[i] = globalRefId;
150         if (length(contigLengths(context)) <= globalRefId)
151             resize(contigLengths(context), globalRefId + 1, 0);
152         contigLengths(context)[globalRefId] = lRef;
153     }
154 }
155 
156 // ----------------------------------------------------------------------------
157 // Function readRecord()                                     BamAlignmentRecord
158 // ----------------------------------------------------------------------------
159 
160 template <typename TBuffer, typename TForwardIter>
161 inline int32_t
162 _readBamRecordWithoutSize(TBuffer & rawRecord, TForwardIter & iter)
163 {
164     int32_t recordLen = 0;
165     readRawPod(recordLen, iter);
166 
167     // fail, if we read "BAM\1" (did you miss to call readRecord(header, bamFile) first?)
168     if (recordLen == 0x014D4142)
169         SEQAN_THROW(ParseError("Unexpected BAM header encountered."));
170 
171     clear(rawRecord);
172     write(rawRecord, iter, (size_t)recordLen);
173     return recordLen;
174 }
175 
176 template <typename TBuffer, typename TForwardIter>
177 inline void
178 _readBamRecord(TBuffer & rawRecord, TForwardIter & iter, Bam)
179 {
180     int32_t recordLen = 0;
181     readRawPod(recordLen, iter);
182 
183     // fail, if we read "BAM\1" (did you miss to call readRecord(header, bamFile) first?)
184     if (recordLen == 0x014D4142)
185         SEQAN_THROW(ParseError("Unexpected BAM header encountered."));
186 
187     clear(rawRecord);
188     appendRawPod(rawRecord, recordLen);
189     write(rawRecord, iter, (size_t)recordLen);
190 }
191 
192 template <typename TForwardIter, typename TNameStore, typename TNameStoreCache, typename TStorageSpec>
193 inline void
194 readRecord(BamAlignmentRecord & record,
195            BamIOContext<TNameStore, TNameStoreCache, TStorageSpec> & context,
196            TForwardIter & iter,
197            Bam const & /* tag */)
198 {
199     typedef typename Iterator<CharString, Standard>::Type                             TCharIter;
200     typedef typename Iterator<String<CigarElement<> >, Standard>::Type SEQAN_RESTRICT TCigarIter;
201     typedef typename Iterator<IupacString, Standard>::Type SEQAN_RESTRICT             TSeqIter;
202     typedef typename Iterator<CharString, Standard>::Type SEQAN_RESTRICT              TQualIter;
203 
204     // Read size and data of the remaining block in one chunk (fastest).
205     int32_t remainingBytes = _readBamRecordWithoutSize(context.buffer, iter);
206     TCharIter it = begin(context.buffer, Standard());
207 
208     // BamAlignmentRecordCore.
209     arrayCopyForward(it, it + sizeof(BamAlignmentRecordCore), reinterpret_cast<char*>(&record));
210     enforceLittleEndian(*reinterpret_cast<BamAlignmentRecordCore*>(&record));
211     it += sizeof(BamAlignmentRecordCore);
212 
213     remainingBytes -= sizeof(BamAlignmentRecordCore) + record._l_qname +
214                       record._n_cigar * 4 + (record._l_qseq + 1) / 2 + record._l_qseq;
215     SEQAN_ASSERT_GEQ(remainingBytes, 0);
216 
217     // Translate file local rID into a global rID that is compatible with the context contigNames.
218     if (record.rID >= 0 && !empty(context.translateFile2GlobalRefId))
219         record.rID = context.translateFile2GlobalRefId[record.rID];
220     if (record.rID >= 0)
221         SEQAN_ASSERT_LT(static_cast<uint64_t>(record.rID), length(contigNames(context)));
222 
223     // ... the same for rNextId
224     if (record.rNextId >= 0 && !empty(context.translateFile2GlobalRefId))
225         record.rNextId = context.translateFile2GlobalRefId[record.rNextId];
226     if (record.rNextId >= 0)
227         SEQAN_ASSERT_LT(static_cast<uint64_t>(record.rNextId), length(contigNames(context)));
228 
229     // query name.
230     resize(record.qName, record._l_qname - 1, Exact());
231     arrayCopyForward(it, it + record._l_qname - 1, begin(record.qName, Standard()));
232     it += record._l_qname;
233 
234     // cigar string.
235     resize(record.cigar, record._n_cigar, Exact());
236     static char const * CIGAR_MAPPING = "MIDNSHP=X*******";
237     TCigarIter cigEnd = end(record.cigar, Standard());
238     for (TCigarIter cig = begin(record.cigar, Standard()); cig != cigEnd; ++cig)
239     {
240         uint32_t opAndCnt;
241         readRawPod(opAndCnt, it);
242         SEQAN_ASSERT_LEQ(opAndCnt & 15, 8u);
243         cig->operation = CIGAR_MAPPING[opAndCnt & 15];
244         cig->count = opAndCnt >> 4;
245     }
246 
247     // query sequence.
248     resize(record.seq, record._l_qseq, Exact());
249     TSeqIter sit = begin(record.seq, Standard());
250     TSeqIter sitEnd = sit + (record._l_qseq & ~1);
251     while (sit != sitEnd)
252     {
253         unsigned char ui = getValue(it);
254         ++it;
255         *sit = Iupac(ui >> 4);
256         ++sit;
257         *sit = Iupac(ui & 0x0f);
258         ++sit;
259     }
260     if (record._l_qseq & 1)
261         *sit++ = Iupac((uint8_t)*it++ >> 4);
262 
263     // phred quality
264     resize(record.qual, record._l_qseq, Exact());
265     // If qual is a sequence of 0xff (heuristic same as samtools: Only look at first byte) then we clear it, to get the
266     // representation of '*';
267     TQualIter qitEnd = end(record.qual, Standard());
268     for (TQualIter qit = begin(record.qual, Standard()); qit != qitEnd;)
269         *qit++ = '!' + *it++;
270     if (!empty(record.qual) && static_cast<char>(record.qual[0] - '!') == '\xff')
271         clear(record.qual);
272 
273     // tags
274     resize(record.tags, remainingBytes, Exact());
275     arrayCopyForward(it, it + remainingBytes, begin(record.tags, Standard()));
276 }
277 
278 }  // namespace seqan
279 
280 #endif  // #ifndef INCLUDE_SEQAN_BAM_IO_READ_BAM_H_
281