1 // ========================================================================== 2 // SeqAn - The Library for Sequence Analysis 3 // ========================================================================== 4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin 5 // All rights reserved. 6 // 7 // Redistribution and use in source and binary forms, with or without 8 // modification, are permitted provided that the following conditions are met: 9 // 10 // * Redistributions of source code must retain the above copyright 11 // notice, this list of conditions and the following disclaimer. 12 // * Redistributions in binary form must reproduce the above copyright 13 // notice, this list of conditions and the following disclaimer in the 14 // documentation and/or other materials provided with the distribution. 15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of 16 // its contributors may be used to endorse or promote products derived 17 // from this software without specific prior written permission. 18 // 19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE 23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 29 // DAMAGE. 30 // 31 // ========================================================================== 32 // Author: Manuel Holtgrewe <manuel.holtgrewe@fu-berlin.de> 33 // ========================================================================== 34 // Code for reading Bam. 35 // ========================================================================== 36 37 #ifndef INCLUDE_SEQAN_BAM_IO_READ_BAM_H_ 38 #define INCLUDE_SEQAN_BAM_IO_READ_BAM_H_ 39 40 namespace seqan { 41 42 // ============================================================================ 43 // Tags, Classes, Enums 44 // ============================================================================ 45 46 /*! 47 * @tag FileFormats#Bam 48 * @brief Identify the BAM format. 49 * 50 * @tag FileFormats#Sam 51 * @brief Identify the SAM format. 52 */ 53 54 struct Bam_; 55 typedef Tag<Bam_> Bam; 56 57 58 template <typename T> 59 struct FileExtensions<Bam, T> 60 { 61 static char const * VALUE[1]; // default is one extension 62 }; 63 64 template <typename T> 65 char const * FileExtensions<Bam, T>::VALUE[1] = 66 { 67 ".bam" // default output extension 68 }; 69 70 71 template <typename T> 72 struct MagicHeader<Bam, T> 73 { 74 static unsigned char const VALUE[4]; 75 }; 76 77 template <typename T> 78 unsigned char const MagicHeader<Bam, T>::VALUE[4] = { 'B', 'A', 'M', '\1' }; // BAM's magic header 79 80 // ============================================================================ 81 // Metafunctions 82 // ============================================================================ 83 84 // ============================================================================ 85 // Functions 86 // ============================================================================ 87 88 // ---------------------------------------------------------------------------- 89 // Function readHeader() BamHeader 90 // ---------------------------------------------------------------------------- 91 92 template <typename TForwardIter, typename TNameStore, typename TNameStoreCache, typename TStorageSpec> 93 inline void 94 readHeader(BamHeader & header, 95 BamIOContext<TNameStore, TNameStoreCache, TStorageSpec> & context, 96 TForwardIter & iter, 97 Bam const & /*tag*/) 98 { 99 clear(header); 100 // Read BAM magic string. 101 String<char, Array<4> > magic; 102 read(magic, iter, 4); 103 if (magic != "BAM\1") 104 SEQAN_THROW(ParseError("Not in BAM format.")); 105 106 // Read header text, including null padding. 107 int32_t lText; 108 readRawPod(lText, iter); 109 110 CharString samHeader; 111 write(samHeader, iter, lText); 112 113 // Truncate to first position of '\0'. 114 cropAfterFirst(samHeader, EqualsChar<'\0'>()); 115 116 // Parse out header records. 117 BamHeaderRecord headerRecord; 118 Iterator<CharString, Rooted>::Type it = begin(samHeader); 119 while (!atEnd(it)) 120 { 121 clear(headerRecord); 122 readRecord(headerRecord, context, it, Sam()); 123 appendValue(header, headerRecord); 124 } 125 126 // Read # reference sequences. 127 int32_t nRef; 128 readRawPod(nRef, iter); 129 CharString name; 130 131 clear(context.translateFile2GlobalRefId); 132 resize(context.translateFile2GlobalRefId, nRef, -1); 133 134 for (int32_t i = 0; i < nRef; ++i) 135 { 136 // Read length of the reference name. 137 int32_t nName; 138 readRawPod(nName, iter); 139 clear(name); 140 write(name, iter, nName); 141 resize(name, nName - 1); 142 // Read length of the reference sequence. 143 int32_t lRef; 144 readRawPod(lRef, iter); 145 146 // Add entry to name store and sequenceInfos if necessary. 147 // Compute translation from local ids (used in the BAM file) to corresponding ids in the name store 148 size_t globalRefId = nameToId(contigNamesCache(context), name); 149 context.translateFile2GlobalRefId[i] = globalRefId; 150 if (length(contigLengths(context)) <= globalRefId) 151 resize(contigLengths(context), globalRefId + 1, 0); 152 contigLengths(context)[globalRefId] = lRef; 153 } 154 } 155 156 // ---------------------------------------------------------------------------- 157 // Function readRecord() BamAlignmentRecord 158 // ---------------------------------------------------------------------------- 159 160 template <typename TBuffer, typename TForwardIter> 161 inline int32_t 162 _readBamRecordWithoutSize(TBuffer & rawRecord, TForwardIter & iter) 163 { 164 int32_t recordLen = 0; 165 readRawPod(recordLen, iter); 166 167 // fail, if we read "BAM\1" (did you miss to call readRecord(header, bamFile) first?) 168 if (recordLen == 0x014D4142) 169 SEQAN_THROW(ParseError("Unexpected BAM header encountered.")); 170 171 clear(rawRecord); 172 write(rawRecord, iter, (size_t)recordLen); 173 return recordLen; 174 } 175 176 template <typename TBuffer, typename TForwardIter> 177 inline void 178 _readBamRecord(TBuffer & rawRecord, TForwardIter & iter, Bam) 179 { 180 int32_t recordLen = 0; 181 readRawPod(recordLen, iter); 182 183 // fail, if we read "BAM\1" (did you miss to call readRecord(header, bamFile) first?) 184 if (recordLen == 0x014D4142) 185 SEQAN_THROW(ParseError("Unexpected BAM header encountered.")); 186 187 clear(rawRecord); 188 appendRawPod(rawRecord, recordLen); 189 write(rawRecord, iter, (size_t)recordLen); 190 } 191 192 template <typename TForwardIter, typename TNameStore, typename TNameStoreCache, typename TStorageSpec> 193 inline void 194 readRecord(BamAlignmentRecord & record, 195 BamIOContext<TNameStore, TNameStoreCache, TStorageSpec> & context, 196 TForwardIter & iter, 197 Bam const & /* tag */) 198 { 199 typedef typename Iterator<CharString, Standard>::Type TCharIter; 200 typedef typename Iterator<String<CigarElement<> >, Standard>::Type SEQAN_RESTRICT TCigarIter; 201 typedef typename Iterator<IupacString, Standard>::Type SEQAN_RESTRICT TSeqIter; 202 typedef typename Iterator<CharString, Standard>::Type SEQAN_RESTRICT TQualIter; 203 204 // Read size and data of the remaining block in one chunk (fastest). 205 int32_t remainingBytes = _readBamRecordWithoutSize(context.buffer, iter); 206 TCharIter it = begin(context.buffer, Standard()); 207 208 // BamAlignmentRecordCore. 209 arrayCopyForward(it, it + sizeof(BamAlignmentRecordCore), reinterpret_cast<char*>(&record)); 210 enforceLittleEndian(*reinterpret_cast<BamAlignmentRecordCore*>(&record)); 211 it += sizeof(BamAlignmentRecordCore); 212 213 remainingBytes -= sizeof(BamAlignmentRecordCore) + record._l_qname + 214 record._n_cigar * 4 + (record._l_qseq + 1) / 2 + record._l_qseq; 215 SEQAN_ASSERT_GEQ(remainingBytes, 0); 216 217 // Translate file local rID into a global rID that is compatible with the context contigNames. 218 if (record.rID >= 0 && !empty(context.translateFile2GlobalRefId)) 219 record.rID = context.translateFile2GlobalRefId[record.rID]; 220 if (record.rID >= 0) 221 SEQAN_ASSERT_LT(static_cast<uint64_t>(record.rID), length(contigNames(context))); 222 223 // ... the same for rNextId 224 if (record.rNextId >= 0 && !empty(context.translateFile2GlobalRefId)) 225 record.rNextId = context.translateFile2GlobalRefId[record.rNextId]; 226 if (record.rNextId >= 0) 227 SEQAN_ASSERT_LT(static_cast<uint64_t>(record.rNextId), length(contigNames(context))); 228 229 // query name. 230 resize(record.qName, record._l_qname - 1, Exact()); 231 arrayCopyForward(it, it + record._l_qname - 1, begin(record.qName, Standard())); 232 it += record._l_qname; 233 234 // cigar string. 235 resize(record.cigar, record._n_cigar, Exact()); 236 static char const * CIGAR_MAPPING = "MIDNSHP=X*******"; 237 TCigarIter cigEnd = end(record.cigar, Standard()); 238 for (TCigarIter cig = begin(record.cigar, Standard()); cig != cigEnd; ++cig) 239 { 240 uint32_t opAndCnt; 241 readRawPod(opAndCnt, it); 242 SEQAN_ASSERT_LEQ(opAndCnt & 15, 8u); 243 cig->operation = CIGAR_MAPPING[opAndCnt & 15]; 244 cig->count = opAndCnt >> 4; 245 } 246 247 // query sequence. 248 resize(record.seq, record._l_qseq, Exact()); 249 TSeqIter sit = begin(record.seq, Standard()); 250 TSeqIter sitEnd = sit + (record._l_qseq & ~1); 251 while (sit != sitEnd) 252 { 253 unsigned char ui = getValue(it); 254 ++it; 255 *sit = Iupac(ui >> 4); 256 ++sit; 257 *sit = Iupac(ui & 0x0f); 258 ++sit; 259 } 260 if (record._l_qseq & 1) 261 *sit++ = Iupac((uint8_t)*it++ >> 4); 262 263 // phred quality 264 resize(record.qual, record._l_qseq, Exact()); 265 // If qual is a sequence of 0xff (heuristic same as samtools: Only look at first byte) then we clear it, to get the 266 // representation of '*'; 267 TQualIter qitEnd = end(record.qual, Standard()); 268 for (TQualIter qit = begin(record.qual, Standard()); qit != qitEnd;) 269 *qit++ = '!' + *it++; 270 if (!empty(record.qual) && static_cast<char>(record.qual[0] - '!') == '\xff') 271 clear(record.qual); 272 273 // tags 274 resize(record.tags, remainingBytes, Exact()); 275 arrayCopyForward(it, it + remainingBytes, begin(record.tags, Standard())); 276 } 277 278 } // namespace seqan 279 280 #endif // #ifndef INCLUDE_SEQAN_BAM_IO_READ_BAM_H_ 281