1 // ==========================================================================
2 // SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2010, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 // * Redistributions of source code must retain the above copyright
11 // notice, this list of conditions and the following disclaimer.
12 // * Redistributions in binary form must reproduce the above copyright
13 // notice, this list of conditions and the following disclaimer in the
14 // documentation and/or other materials provided with the distribution.
15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 // its contributors may be used to endorse or promote products derived
17 // from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: David Weese <david.weese@fu-berlin.de>
33 // ==========================================================================
34
35 #include <iostream>
36
37 #ifndef SEQAN_HEADER_STORE_IO_SAM_H
38 #define SEQAN_HEADER_STORE_IO_SAM_H
39
40 namespace SEQAN_NAMESPACE_MAIN
41 {
42
43 //////////////////////////////////////////////////////////////////////////////
44 // File tags
45 //////////////////////////////////////////////////////////////////////////////
46
47 /**
48 .Tag.File Format.tag.Sam:
49 Sam mapping file.
50 ..include:seqan/store.h
51 */
52 struct Sam_;
53 typedef Tag<Sam_> const Sam;
54
55
56 //////////////////////////////////////////////////////////////////////////////
57 // CIGAR struct
58 //////////////////////////////////////////////////////////////////////////////
59
60 #if SEQAN_HAS_SAMTOOLS
61 struct FromBam_;
62 typedef Tag<FromBam_> FromBam;
63 #endif // #if SEQAN_HAS_SAMTOOLS
64
65 template <typename TOperation_ = char, typename TCount_ = unsigned>
66 struct CigarElement
67 {
68 typedef TOperation_ TOperation;
69 typedef TCount_ TCount;
70
71 TOperation operation;
72 TCount count;
73
CigarElementCigarElement74 CigarElement() : operation(0), count(0) {}
75
CigarElementCigarElement76 CigarElement(TOperation o, TCount c):
77 operation(o),
78 count(c) {}
79
80 #if SEQAN_HAS_SAMTOOLS
CigarElementCigarElement81 CigarElement(__uint32 bamCigarElement, FromBam const &)
82 {
83 SEQAN_ASSERT_LEQ(bamCigarElement & BAM_CIGAR_MASK, 8u);
84 operation = "MIDNSHP=X"[bamCigarElement & BAM_CIGAR_MASK];
85 count = bamCigarElement >> 4;
86 }
87 #endif // #if SEQAN_HAS_SAMTOOLS
88 };
89
90 template <typename TOperation, typename TCount>
toBamCigarElement(CigarElement<TOperation,TCount> const & cigarElement)91 __uint32 toBamCigarElement(CigarElement<TOperation, TCount> const & cigarElement)
92 {
93 char operation = 0;
94 switch (cigarElement.operation) {
95 case 'X': operation += 1;
96 case '=': operation += 1;
97 case 'P': operation += 1;
98 case 'H': operation += 1;
99 case 'S': operation += 1;
100 case 'N': operation += 1;
101 case 'D': operation += 1;
102 case 'I': operation += 1;
103 case 'M': break;
104 }
105 return (cigarElement.count << 4) | operation;
106 }
107
108 //____________________________________________________________________________
109
110 template <
111 typename TCigar,
112 typename TGaps1,
113 typename TGaps2,
114 typename TThresh>
115 inline void
getCigarString(TCigar & cigar,TGaps1 & gaps1,TGaps2 & gaps2,TThresh splicedGapThresh)116 getCigarString(
117 TCigar &cigar,
118 TGaps1 &gaps1,
119 TGaps2 &gaps2,
120 TThresh splicedGapThresh)
121 {
122 typename Iterator<TGaps1>::Type it1 = begin(gaps1);
123 typename Iterator<TGaps2>::Type it2 = begin(gaps2);
124 clear(cigar);
125 char op, lastOp = ' ';
126 unsigned numOps = 0;
127
128 // std::cout << gaps1 << std::endl;
129 // std::cout << gaps2 << std::endl;
130 for (; !atEnd(it1) && !atEnd(it2); goNext(it1), goNext(it2))
131 {
132 if (isGap(it1))
133 {
134 if (isGap(it2))
135 op = 'P';
136 else if (isClipped(it2))
137 op = '?';
138 else
139 op = 'I';
140 }
141 else if (isClipped(it1))
142 {
143 op = '?';
144 }
145 else
146 {
147 if (isGap(it2))
148 op = 'D';
149 else if (isClipped(it2))
150 op = 'S';
151 else
152 op = 'M';
153 }
154 if (lastOp != op)
155 {
156 if (lastOp == 'D' && numOps >= (unsigned)splicedGapThresh)
157 lastOp = 'N';
158 if (numOps > 0)
159 {
160 std::stringstream num;
161 num << numOps;
162 append(cigar, num.str());
163 appendValue(cigar, lastOp);
164 }
165 numOps = 0;
166 lastOp = op;
167 }
168 ++numOps;
169 }
170 SEQAN_ASSERT_EQ(atEnd(it1), atEnd(it2));
171 if (lastOp == 'D' && numOps >= (unsigned)splicedGapThresh)
172 lastOp = 'N';
173 if (numOps > 0)
174 {
175 std::stringstream num;
176 num << numOps;
177 append(cigar, num.str());
178 appendValue(cigar, lastOp);
179 }
180 }
181
182 template <
183 typename TCigar,
184 typename TGaps1,
185 typename TGaps2>
186 inline void
getCigarString(TCigar & cigar,TGaps1 & gaps1,TGaps2 & gaps2)187 getCigarString(
188 TCigar &cigar,
189 TGaps1 &gaps1,
190 TGaps2 &gaps2)
191 {
192 return getCigarString(cigar, gaps1, gaps2, 20);
193 }
194
195 template <
196 typename TOperation,
197 typename TCount,
198 typename TSpec,
199 typename TGaps1,
200 typename TGaps2,
201 typename TThresh>
202 inline void
getCigarString(String<CigarElement<TOperation,TCount>,TSpec> & cigar,TGaps1 & gaps1,TGaps2 & gaps2,TThresh splicedGapThresh)203 getCigarString(
204 String<CigarElement<TOperation, TCount>, TSpec> &cigar,
205 TGaps1 &gaps1,
206 TGaps2 &gaps2,
207 TThresh splicedGapThresh)
208 {
209 typename Iterator<TGaps1>::Type it1 = begin(gaps1);
210 typename Iterator<TGaps2>::Type it2 = begin(gaps2);
211 clear(cigar);
212 char op, lastOp = ' ';
213 unsigned numOps = 0;
214
215 // std::cout << gaps1 << std::endl;
216 // std::cout << gaps2 << std::endl;
217 for (; !atEnd(it1) && !atEnd(it2); goNext(it1), goNext(it2))
218 {
219 if (isGap(it1))
220 {
221 if (isGap(it2))
222 op = 'P';
223 else if (isClipped(it2))
224 op = '?';
225 else
226 op = 'I';
227 }
228 else if (isClipped(it1))
229 {
230 op = '?';
231 }
232 else
233 {
234 if (isGap(it2))
235 op = 'D';
236 else if (isClipped(it2))
237 op = 'S';
238 else
239 op = 'M';
240 }
241 if (lastOp != op)
242 {
243 if (lastOp == 'D' && numOps >= (unsigned)splicedGapThresh)
244 lastOp = 'N';
245 if (numOps > 0)
246 appendValue(cigar, CigarElement<>(op, numOps));
247 numOps = 0;
248 lastOp = op;
249 }
250 ++numOps;
251 }
252 SEQAN_ASSERT_EQ(atEnd(it1), atEnd(it2));
253 if (lastOp == 'D' && numOps >= (unsigned)splicedGapThresh)
254 lastOp = 'N';
255 if (numOps > 0)
256 appendValue(cigar, CigarElement<>(op, numOps));
257 }
258
259 //////////////////////////////////////////////////////////////////////////////
260 // _getClippedLength
261
262 template <typename TCigarString, typename TNum>
_getClippedLength(TCigarString const & cigar,TNum & sum)263 inline void _getClippedLength(TCigarString const & cigar, TNum & sum)
264 {
265 typedef typename Iterator<TCigarString, Standard>::Type TCigarIter;
266
267 TCigarIter it = begin(cigar, Standard());
268 TCigarIter itEnd = end(cigar, Standard());
269
270 sum = 0;
271 for (; it != itEnd; ++it)
272 if (getValue(it).operation != 'S' && getValue(it).operation != 'H')
273 sum += getValue(it).count;
274 }
275
276 //////////////////////////////////////////////////////////////////////////////
277 // convert CIGAR to gaps
278
279 template<typename TCigarString, typename TGaps>
cigarToGapAnchorRead(TCigarString const & cigar,TGaps & gaps)280 inline void cigarToGapAnchorRead(TCigarString const & cigar, TGaps & gaps)
281 {
282 typename Iterator<TGaps>::Type it = begin(gaps);
283 for (unsigned i = 0; i < length(cigar); ++i)
284 {
285 switch (cigar[i].operation)
286 {
287 case 'D':
288 case 'N':
289 case 'P':
290 insertGaps(it, cigar[i].count);
291 case 'I':
292 case 'M':
293 case 'S':
294 it += cigar[i].count;
295 }
296 }
297 }
298
299 template<typename TCigarString, typename TGaps>
cigarToGapAnchorContig(TCigarString const & cigar,TGaps & gaps)300 inline void cigarToGapAnchorContig(TCigarString const & cigar, TGaps & gaps)
301 {
302 typename Iterator<TGaps>::Type it = begin(gaps);
303 for (unsigned i = 0; i < length(cigar); ++i)
304 {
305 switch (cigar[i].operation)
306 {
307 case 'I':
308 case 'P':
309 insertGaps(it, cigar[i].count);
310 case 'D':
311 case 'M':
312 case 'N':
313 case 'S':
314 it += cigar[i].count;
315 }
316 }
317 }
318
319
320
321 //////////////////////////////////////////////////////////////////////////////
322 // Parsing functions
323 //////////////////////////////////////////////////////////////////////////////
324
325
326 //////////////////////////////////////////////////////////////////////////////
327 // _parseReadCigar
328
329 template <typename TFile, typename TCigarString, typename TChar>
330 inline void
_parseReadCigar(TFile & file,TCigarString & cigar,TChar & c)331 _parseReadCigar(TFile & file, TCigarString & cigar, TChar & c)
332 {
333 typedef typename Value<TCigarString>::Type TCigarElement;
334 typedef typename TCigarElement::TOperation TOperation;
335 typedef typename TCigarElement::TCount TCount;
336
337 clear(cigar);
338
339 // if the CIGAR is not set and '*'
340 if (c == '*')
341 {
342 c = _streamGet(file);
343 return;
344 }
345
346 while (!_streamEOF(file))
347 {
348 TCount count = _parseReadNumber(file, c);
349 if (c >= 'a' && c <= 'z')
350 c = c + 'A' - 'a';
351 appendValue(cigar, TCigarElement(c, count));
352
353 c = _streamGet(file);
354 if (c == ' ' || c == '\t' || c == '\n') break;
355 }
356 }
357
358 //////////////////////////////////////////////////////////////////////////////
359 // _parseReadSamIdentifier
360
361 template<typename TFile, typename TString, typename TChar>
362 inline void
_parseReadSamIdentifier(TFile & file,TString & str,TChar & c)363 _parseReadSamIdentifier(TFile & file, TString & str, TChar& c)
364 {
365 if (c == ' ' || c == '\t' || c == '\n') return;
366 appendValue(str, c);
367 while (!_streamEOF(file))
368 {
369 c = _streamGet(file);
370 if (c == ' ' || c == '\t' || c == '\n') return;
371 appendValue(str, c);
372 }
373 }
374
375 //////////////////////////////////////////////////////////////////////////////
376 // _parseIsDna
377
378 template<typename TChar>
379 inline bool
_parseIsDna(TChar const & c)380 _parseIsDna(TChar const & c)
381 {
382 char x = TChar(Dna5(c));
383 return (c == x) || (c + 'A' - 'a' == x);
384 }
385
386 //////////////////////////////////////////////////////////////////////////////
387 //_parseReadDnaSeq
388
389 template<typename TFile, typename TString, typename TChar>
390 inline void
_parseReadDnaSeq(TFile & file,TString & str,TChar & c)391 _parseReadDnaSeq(TFile & file, TString & str, TChar & c)
392 {
393 TChar first = c;
394 if (!_streamEOF(file))
395 c = _streamGet(file);
396
397 if (!_parseIsDna(first))
398 return;
399 appendValue(str, first, Generous());
400
401 for (; !_streamEOF(file) && _parseIsDna(c); c = _streamGet(file))
402 appendValue(str, c, Generous());
403 }
404
405 //////////////////////////////////////////////////////////////////////////////
406 // _parseIsPhredQual
407
408 template <typename TChar>
409 inline bool
_parseIsPhredQual(TChar c)410 _parseIsPhredQual(TChar c)
411 {
412 return c >= '!' && c <= '~';
413 }
414
415 //////////////////////////////////////////////////////////////////////////////
416 // _parseReadSeqQual
417 //
418
419 template<typename TFile, typename TQualString, typename TChar>
420 inline void
_parseReadSeqQual(TFile & file,TQualString & str,TChar & c)421 _parseReadSeqQual(TFile & file, TQualString & str, TChar & c)
422 {
423 typedef typename Size<TQualString>::Type TSize;
424 typedef typename Iterator<TQualString, Standard>::Type TIter;
425
426 TIter itBegin = begin(str, Standard());
427 TSize rest = length(str);
428
429 int q = 0;
430 for (TIter it = itBegin; rest != 0 && _parseIsPhredQual(c); --rest, ++it)
431 {
432 q = c - '!';
433 if (!_streamEOF(file))
434 c = _streamGet(file);
435 else
436 if (rest > 1)
437 rest = 1;
438
439 if (q == '*' - '!' && !_parseIsPhredQual(c) && it == itBegin)
440 return;
441
442 assignQualityValue(*it, q);
443 }
444 }
445
446 //////////////////////////////////////////////////////////////////////////////
447 // _parseReadCharsUntilEndOfLine
448 //
449 // Reads all symbols till the next '\n' and writes them in the CharString str
450 // the c is the first character after the '\n'.
451
452 template<typename TFile, typename TChar>
453 inline void
_parseReadCharsUntilEndOfLine(TFile & file,String<char> & str,TChar & c)454 _parseReadCharsUntilEndOfLine(TFile & file, String<char> & str, TChar& c)
455 {
456 // read all chars till '\n'
457 while (c != '\n')
458 {
459 appendValue(str, c, Generous());
460 if (_streamEOF(file)) return;
461 c = _streamGet(file);
462 }
463
464 // read the first char after the '\n'
465 if (!_streamEOF(file))
466 c = _streamGet(file);
467 }
468
469 //////////////////////////////////////////////////////////////////////////////
470 // appendAlignment
471
472 template<typename TSpec, typename TConfig, typename TId, typename TPos, typename TGaps>
473 inline typename Size<typename FragmentStore<TSpec, TConfig>::TAlignedReadStore>::Type
appendAlignment(FragmentStore<TSpec,TConfig> & fragStore,TId readId,TId contigId,TPos beginPos,TPos endPos,TGaps const & gaps)474 appendAlignment(
475 FragmentStore<TSpec, TConfig> & fragStore,
476 TId readId,
477 TId contigId,
478 TPos beginPos,
479 TPos endPos,
480 TGaps const & gaps)
481 {
482 typedef FragmentStore<TSpec, TConfig> TFragmentStore;
483 typedef typename Value<typename TFragmentStore::TAlignedReadStore>::Type TAlignedElement;
484
485 TId id = length(fragStore.alignedReadStore);
486 TAlignedElement alignedElem = TAlignedElement(id, readId, contigId, beginPos, endPos, gaps);
487 appendValue(fragStore.alignedReadStore, alignedElem);
488
489 return id;
490 }
491
492
493
494 //////////////////////////////////////////////////////////////////////////////
495 // read functions for Sam
496 //////////////////////////////////////////////////////////////////////////////
497
498
499 //////////////////////////////////////////////////////////////////////////////
500 // _generatePairMatchIds
501 //
502 template <typename TPos, typename TId>
503 struct MatchMateInfo_
504 {
505 TId readId;
506 TId contigId;
507 TId pairMatchId;
508 TPos beginPos;
509 };
510
511 struct MatchMateInfoLess_
512 {
513 template <typename TMInfo>
514 inline bool
operatorMatchMateInfoLess_515 operator() (TMInfo const &a, TMInfo const &b) const
516 {
517 return (a.contigId < b.contigId) || (a.contigId == b.contigId && a.beginPos < b.beginPos);
518 }
519 };
520
521 template<typename TSpec, typename TConfig, typename TMatchMateInfos>
522 inline void
_generatePairMatchIds(FragmentStore<TSpec,TConfig> & fragStore,TMatchMateInfos & matchMateInfos)523 _generatePairMatchIds (
524 FragmentStore<TSpec, TConfig> & fragStore,
525 TMatchMateInfos & matchMateInfos)
526 {
527 typedef FragmentStore<TSpec, TConfig> TFragmentStore;
528
529 typedef typename TFragmentStore::TReadStore TReadStore;
530 typedef typename TFragmentStore::TAlignedReadStore TAlignedReadStore;
531 typedef typename TFragmentStore::TContigPos TContigPos;
532
533 typedef typename Value<TReadStore>::Type TRead;
534 typedef typename Value<TAlignedReadStore>::Type TAlignedRead;
535 typedef typename Iterator<TAlignedReadStore, Standard>::Type TIter;
536 typedef typename Iterator<TMatchMateInfos, Standard>::Type TMIter;
537 typedef typename Id<TFragmentStore>::Type TId;
538
539 TIter it = begin(fragStore.alignedReadStore, Standard());
540 TIter itEnd = end(fragStore.alignedReadStore, Standard());
541 TMIter mit = begin(matchMateInfos, Standard());
542 TMIter mitEnd = end(matchMateInfos, Standard());
543
544 if (it == itEnd || mit == mitEnd) return;
545
546 // sort the aligned read store by: begin position, contig name
547 sortAlignedReads(fragStore.alignedReadStore, SortBeginPos());
548 sortAlignedReads(fragStore.alignedReadStore, SortContigId());
549 std::sort(mit, mitEnd, MatchMateInfoLess_());
550
551 TMIter mitNext = mit;
552 while (mitNext != mitEnd && (*mit).beginPos == (*mitNext).beginPos)
553 ++mitNext;
554
555 TContigPos pos = _min((*it).beginPos, (*it).endPos);
556 typename Size<TReadStore>::Type readStoreLength = length(fragStore.readStore);
557
558 while (mit != mitEnd)
559 {
560 int cmp = 0;
561 if ((*it).contigId < (*mit).contigId) cmp = -1;
562 else if ((*it).contigId > (*mit).contigId) cmp = 1;
563 else if (pos < (*mit).beginPos) cmp = -1;
564 else if (pos > (*mit).beginPos) cmp = 1;
565
566 if (cmp == 1)
567 {
568 mit = mitNext;
569 while (mitNext != mitEnd && (*mit).beginPos == (*mitNext).beginPos)
570 ++mitNext;
571 continue;
572 }
573 if (cmp == 0)
574 {
575 for (TMIter m = mit; m != mitNext; ++m)
576 if ((*it).readId != (*m).readId && (*it).readId < readStoreLength && (*m).readId < readStoreLength) // correct position found
577 if (fragStore.readStore[(*it).readId].matePairId == fragStore.readStore[(*m).readId].matePairId) // correct mate found
578 (*it).pairMatchId = (*m).pairMatchId; // link mate
579 }
580 if (++it == itEnd) break;
581 pos = _min((*it).beginPos, (*it).endPos);
582 }
583 }
584
585 //////////////////////////////////////////////////////////////////////////////
586 // read
587
588 ///.Function.read.param.tag.type:Tag.File Format.tag.Sam
589
590 template<typename TFile, typename TSpec, typename TConfig>
591 inline void
read(TFile & file,FragmentStore<TSpec,TConfig> & fragStore,Sam)592 read(
593 TFile & file,
594 FragmentStore<TSpec, TConfig> & fragStore,
595 Sam)
596 {
597 typedef Value<FILE>::Type TValue;
598 typedef FragmentStore<TSpec, TConfig> TFragmentStore;
599 typedef typename TFragmentStore::TContigPos TContigPos;
600 typedef typename Id<TFragmentStore>::Type TId;
601
602 // data structure to temporarily store the gaps that need to be inserted in the contig sequences
603 typedef MatchMateInfo_<TContigPos, TId> TMatchMateInfo;
604 typedef String<TMatchMateInfo> TMatchMateInfos;
605 typedef StringSet<String<typename TFragmentStore::TContigGapAnchor> > TContigAnchorGaps;
606
607 // data structure to temporarily store information about match mates
608 TMatchMateInfos matchMateInfos;
609 TContigAnchorGaps contigAnchorGaps;
610
611 if (_streamEOF(file)) return;
612
613 // get first character from the stream
614 char c = _streamGet(file);
615
616 // Read in header section
617 _readHeader(file, fragStore, c, Sam());
618
619 // Read in alignments section
620 _readAlignments(file, fragStore, contigAnchorGaps, matchMateInfos, c, Sam());
621
622 // set the match mate IDs using the information stored in matchMateInfos
623 _generatePairMatchIds(fragStore, matchMateInfos);
624
625 convertPairWiseToGlobalAlignment(fragStore, contigAnchorGaps);
626 }
627
628 //////////////////////////////////////////////////////////////////////////////
629 // _readHeader
630
631 template<typename TFile, typename TSpec, typename TConfig, typename TChar>
632 inline void
_readHeader(TFile & file,FragmentStore<TSpec,TConfig> &,TChar & c,Sam)633 _readHeader (
634 TFile & file,
635 FragmentStore<TSpec, TConfig> &,
636 TChar & c,
637 Sam)
638 {
639 // skip header for now
640 while (c == '@')
641 _parseSkipLine(file, c);
642 }
643
644 //////////////////////////////////////////////////////////////////////////////
645 // _readAlignments
646 //
647 // reads in alignement sections from a Sam file
648
649 template<typename TFile, typename TSpec, typename TConfig, typename TContigAnchorGaps, typename TMatchMateInfos, typename TChar>
650 inline void
_readAlignments(TFile & file,FragmentStore<TSpec,TConfig> & fragStore,TContigAnchorGaps & contigAnchorGaps,TMatchMateInfos & matchMateInfos,TChar & c,Sam)651 _readAlignments (
652 TFile & file,
653 FragmentStore<TSpec, TConfig> & fragStore,
654 TContigAnchorGaps & contigAnchorGaps,
655 TMatchMateInfos & matchMateInfos,
656 TChar & c,
657 Sam)
658 {
659 // create dummy entries in Sam specific aligned read quality store and aligned read tag store
660 // is needed so the ID in the aligned store can be use to access the other stores
661 // even if there exists previous entries without
662 typedef FragmentStore<TSpec, TConfig> TFragmentStore;
663 typedef typename TFragmentStore::TAlignQualityStore TAlignQualityStore;
664 typedef typename TFragmentStore::TNameStore TNameStore;
665 typedef typename Value<TAlignQualityStore>::Type TAlignQuality;
666
667 TAlignQuality q;
668 q.score = maxValue(q.score);
669 int diff = length(fragStore.alignedReadStore) - length(fragStore.alignQualityStore);
670 for(int i = 0; i < diff; ++i)
671 appendValue(fragStore.alignQualityStore, q, Generous());
672
673 diff = length(fragStore.alignedReadStore) - length(fragStore.alignedReadTagStore);
674 for(int i = 0; i < diff; ++i)
675 appendValue(fragStore.alignedReadTagStore, "", Generous());
676
677 // read in alignments
678 Nothing contextSAM;
679 refresh(fragStore.contigNameStoreCache);
680 refresh(fragStore.readNameStoreCache);
681
682 while (!_streamEOF(file))
683 _readOneAlignment(file, fragStore, contigAnchorGaps, matchMateInfos, c, Sam(), contextSAM);
684 }
685
686
687 //////////////////////////////////////////////////////////////////////////////
688 // _readOneAlignment
689 //
690 // reads in one alignement section from a Sam file
691
692 template<typename TFile, typename TSpec, typename TConfig, typename TContigAnchorGaps, typename TMatchMateInfos, typename TChar, typename TContextSAM>
693 inline void
_readOneAlignment(TFile & file,FragmentStore<TSpec,TConfig> & fragStore,TContigAnchorGaps & contigAnchorGaps,TMatchMateInfos & matchMateInfos,TChar & c,Sam,TContextSAM & contextSAM)694 _readOneAlignment (
695 TFile & file,
696 FragmentStore<TSpec, TConfig> & fragStore,
697 TContigAnchorGaps & contigAnchorGaps,
698 TMatchMateInfos & matchMateInfos,
699 TChar & c,
700 Sam,
701 TContextSAM & contextSAM)
702 {
703 // Basic types
704 typedef FragmentStore<TSpec, TConfig> TFragmentStore;
705 typedef typename Id<TFragmentStore>::Type TId;
706 typedef typename Size<TFragmentStore>::Type TSize;
707
708 // All fragment store element types
709 typedef typename Value<typename TFragmentStore::TContigStore>::Type TContigElement;
710 typedef typename Value<typename TFragmentStore::TLibraryStore>::Type TLibraryStoreElement;
711 typedef typename Value<typename TFragmentStore::TMatePairStore>::Type TMatePairElement;
712 typedef typename Value<typename TFragmentStore::TReadStore>::Type TReadStoreElement;
713 typedef typename Value<typename TFragmentStore::TAlignedReadStore>::Type TAlignedElement;
714 typedef typename Value<typename TFragmentStore::TAlignQualityStore>::Type TAlignQualityElement;
715 typedef typename TAlignedElement::TGapAnchors TReadGapAnchors;
716
717 // Type for sequence in readstore
718 typedef typename TFragmentStore::TReadSeq TReadSeq2;
719
720 // Type for gap anchor
721 typedef typename TFragmentStore::TContigPos TContigPos;
722 typedef Gaps<TReadSeq2, AnchorGaps<TReadGapAnchors> > TReadGaps;
723 typedef Gaps<Nothing, AnchorGaps<typename Value<TContigAnchorGaps>::Type> > TContigGapsPW;
724
725 // Type to temporarily store information about match mates
726 typedef typename Value<TMatchMateInfos>::Type TMatchMateInfo;
727
728 // read fields of alignments line
729 _parseSkipWhitespace(file, c);
730
731 // Read the query name. The letters until the first
732 // whitespace will be read into qname. Then, we skip until we
733 // hit the first tab character.
734 String<char> qname;
735 _parseReadSamIdentifier(file, qname, c);
736 _parseSkipUntilChar(file, '\t', c);
737
738 // read the flag
739 int flag;
740 flag = _parseReadNumber(file, c);
741 _parseSkipWhitespace(file, c);
742 bool reverse = (flag & (1 << 4)) == (1 << 4);
743
744 // Read reference name. Same behaviour as for query name: Read up to
745 // the first whitespace character and skip to next tab char.
746 String<char> rname;
747 _parseReadSamIdentifier(file, rname, c);
748 _parseSkipUntilChar(file, '\t', c);
749
750 // read begin position
751 TContigPos beginPos;
752 beginPos = _parseReadNumber(file, c);
753 --beginPos; // Sam stores positions starting at 1 the fragment store starting at 0
754 _parseSkipWhitespace(file, c);
755
756 // read map quality
757 TAlignQualityElement mapQ;
758 mapQ.score = _parseReadNumber(file, c);
759 _parseSkipWhitespace(file, c);
760
761 // read CIGAR
762 String<CigarElement<> > cigar;
763 _parseReadCigar(file, cigar, c);
764 _parseSkipWhitespace(file, c);
765
766 // calculate the end position
767 TContigPos endPos;
768 _getClippedLength(cigar, endPos);
769 endPos = beginPos + endPos;
770 // if the read is on the antisense strand switch begin and end position
771 if (reverse)
772 {
773 TContigPos temp = beginPos;
774 beginPos = endPos;
775 endPos = temp;
776 }
777
778 // generate gap anchor string for the read
779 TReadGapAnchors readGapAnchors;
780
781 // read mate reference name
782 String<char> mrnm;
783 _parseReadSamIdentifier(file, mrnm, c);
784 _parseSkipWhitespace(file, c);
785
786 // read mate position
787 TContigPos mPos;
788 mPos = _parseReadNumber(file, c);
789 --mPos; // Sam stores positions starting at 1 the fragment store starting at 0
790 _parseSkipWhitespace(file, c);
791
792 // read iSize
793 _parseReadNumber(file, c);
794 _parseSkipWhitespace(file, c);
795
796 // read in sequence
797 TReadSeq2 readSeq;
798 _parseReadDnaSeq(file, readSeq, c);
799 SEQAN_ASSERT_GT(length(readSeq), 0u);
800 if (reverse) // TODO(holtgrew): Why not after parsing quality? ib. for BAM...
801 reverseComplement(readSeq);
802 _parseSkipWhitespace(file, c);
803
804 // and associated qualities
805 _parseReadSeqQual(file, readSeq, c);
806
807 // insert alignment gaps
808 TReadGaps readGaps(readSeq, readGapAnchors);
809 cigarToGapAnchorRead(cigar, readGaps);
810
811 // read in Sam tags
812 String<char> tags;
813 _parseSkipSpace(file, c);
814 _parseReadCharsUntilEndOfLine(file, tags, c);
815
816 if (empty(qname) || empty(rname))
817 return;
818
819 // check if read sequence is already in the store.
820 // if so get the ID, otherwise create new entries in the
821 // read, read name and mate pair store
822
823 TId readId = 0;
824 _storeAppendRead(fragStore, readId, qname, readSeq, flag, contextSAM);
825
826 // check if the contig is already in the store
827 // get its ID or create a new one otherwise
828 TId contigId = 0;
829 _storeAppendContig(fragStore, contigId, rname);
830
831 if (empty(cigar)) return;
832
833 // create a new entry in the aligned read store
834 TId pairMatchId = appendAlignment(fragStore, readId, contigId, beginPos, endPos, readGapAnchors);
835 resize(contigAnchorGaps, length(fragStore.alignedReadStore), Generous());
836 TContigGapsPW contigGaps(back(contigAnchorGaps));
837 cigarToGapAnchorContig(cigar, contigGaps);
838
839 // create entries in Sam specific stores
840 appendValue(fragStore.alignQualityStore, mapQ, Generous());
841 appendValue(fragStore.alignedReadTagStore, tags, Generous());
842
843 // store additional data about match mate temporarily
844 // used in the end of the read function to generate match mate IDs
845 TId mcontigId = contigId;
846 if (mrnm != "*")
847 {
848 if (mrnm != "=")
849 _storeAppendContig(fragStore, mcontigId, mrnm);
850
851 if (flag & 0x40) // store mate info only for the first read in the pair
852 {
853 TMatchMateInfo matchMateInfo = {readId, mcontigId, pairMatchId, mPos};
854 appendValue(matchMateInfos, matchMateInfo);
855 back(fragStore.alignedReadStore).pairMatchId = pairMatchId;
856 }
857 }
858 }
859
860
861 //////////////////////////////////////////////////////////////////////////////
862 // write functions for Sam
863 //////////////////////////////////////////////////////////////////////////////
864
865 //////////////////////////////////////////////////////////////////////////////
866 // _writeAlignments
867
868 template<typename TFile, typename TSpec, typename TConfig>
_writeHeader(TFile & target,FragmentStore<TSpec,TConfig> & store,Sam)869 inline void _writeHeader(TFile & target,
870 FragmentStore<TSpec, TConfig> & store,
871 Sam)
872 {
873 typedef FragmentStore<TSpec, TConfig> TFragmentStore;
874 typedef typename TFragmentStore::TLibraryStore TLibraryStore;
875 typedef typename TFragmentStore::TContigStore TContigStore;
876 typedef typename TFragmentStore::TNameStore TNameStore;
877
878 typedef typename Value<TContigStore>::Type TContig;
879 typedef typename Iterator<TLibraryStore, Standard>::Type TLibraryIter;
880 typedef typename Iterator<TContigStore, Standard>::Type TContigIter;
881 typedef typename Iterator<TNameStore, Standard>::Type TContigNameIter;
882 typedef typename Id<TContig>::Type TId;
883
884 TContigIter it = begin(store.contigStore, Standard());
885 TContigIter itEnd = end(store.contigStore, Standard());
886 TContigNameIter nit = begin(store.contigNameStore, Standard());
887 TContigNameIter nitEnd = end(store.contigNameStore, Standard());
888
889 _streamWrite(target, "@HD\tVN:1.0\n");
890 for(; it != itEnd; ++it)
891 {
892 _streamWrite(target, "@SQ\tSN:");
893 if (nit != nitEnd)
894 {
895 _streamWrite(target, *nit);
896 ++nit;
897 }
898 _streamWrite(target, "\tLN:");
899 _streamPutInt(target, length((*it).seq));
900 _streamPut(target, '\n');
901 }
902
903 TLibraryIter lit = begin(store.libraryStore, Standard());
904 TLibraryIter litEnd = end(store.libraryStore, Standard());
905 for(TId id = 0; lit != litEnd; ++lit, ++id)
906 {
907 _streamWrite(target, "@RG\tID:");
908 _streamPutInt(target, id + 1);
909 _streamWrite(target, "\tLB:");
910 _streamWrite(target, store.libraryNameStore[id]);
911 _streamWrite(target, "\tPI:");
912 _streamPutInt(target, (int)/*std::round*/(store.libraryStore[id].mean));
913 _streamWrite(target, "\tSM:none"); // sample name needs to be included into fragment store
914 _streamPut(target, '\n');
915 }
916 _streamWrite(target, "@PG\tID:SeqAn\n");
917 }
918
919
920 //////////////////////////////////////////////////////////////////////////////
921 // _writeAlignments
922
923 template<typename TFile, typename TSpec, typename TConfig>
_writeAlignments(TFile & target,FragmentStore<TSpec,TConfig> & store,Sam)924 inline void _writeAlignments(TFile & target,
925 FragmentStore<TSpec, TConfig> & store,
926 Sam)
927 {
928 typedef FragmentStore<TSpec, TConfig> TFragmentStore;
929
930 typedef typename TFragmentStore::TReadStore TReadStore;
931 typedef typename TFragmentStore::TReadSeqStore TReadSeqStore;
932 typedef typename TFragmentStore::TAlignedReadStore TAlignedReadStore;
933 typedef typename TFragmentStore::TContigStore TContigStore;
934 typedef typename TFragmentStore::TReadSeq TReadSeq;
935
936 typedef typename Value<TReadStore>::Type TRead;
937 typedef typename Value<TReadSeqStore>::Type TReadSeqStored;
938 typedef typename Value<TContigStore>::Type TContig;
939 typedef typename Value<TAlignedReadStore>::Type TAlignedRead;
940
941 typedef typename TContig::TContigSeq TContigSeq;
942 typedef typename Iterator<TAlignedReadStore, Standard>::Type TAlignIter;
943 typedef typename Iterator<TReadSeqStored, Standard>::Type TReadSeqIter;
944 typedef typename Id<TAlignedRead>::Type TId;
945
946 typedef Gaps<TReadSeq, AnchorGaps<typename TAlignedRead::TGapAnchors> > TReadGaps;
947 typedef Gaps<Nothing, AnchorGaps<typename TContig::TGapAnchors> > TContigGaps;
948
949 String<int> mateIndex; // store outer library size for each pair match (indexed by pairMatchId)
950 calculateMateIndices(mateIndex, store);
951
952 TAlignIter it = begin(store.alignedReadStore, Standard());
953 TAlignIter itEnd = end(store.alignedReadStore, Standard());
954 TAlignIter mit = it;
955 CharString cigar;
956 TReadSeq readSeq;
957
958 // int i = 0;
959 for(; it != itEnd; ++it)
960 {
961 // std::cerr << "alignment " << i++ << std::endl;
962 TId alignedId = (*it).id;
963 TId readId = (*it).readId;
964 TId mateIdx = TRead::INVALID_ID;
965
966 if ((*it).pairMatchId != TRead::INVALID_ID)
967 mateIdx = mateIndex[2*(*it).pairMatchId + getMateNo(store, (*it).readId)];
968
969 TContigGaps contigGaps(/*store.contigStore[(*it).contigId].seq, */store.contigStore[(*it).contigId].gaps);
970 __int64 pos = positionGapToSeq(contigGaps, _min((*it).beginPos, (*it).endPos)) + 1;
971 __int64 mpos = 0;
972 int isize = 0;
973 unsigned short flag = 0;
974
975 if ((*it).beginPos > (*it).endPos)
976 flag |= 0x0010;
977
978 // calculate flags, mpos, isize
979 if (mateIdx < length(store.alignedReadStore))
980 {
981 mit = begin(store.alignedReadStore, Standard()) + mateIdx;
982 if ((*it).contigId == (*mit).contigId)
983 {
984 mpos = positionGapToSeq(contigGaps, _min((*mit).beginPos, (*mit).endPos)) + 1;
985 if ((*it).beginPos < (*mit).beginPos)
986 isize = positionGapToSeq(contigGaps, _max((*mit).beginPos, (*mit).endPos) - 1) + 2 - pos;
987 else
988 isize = mpos - positionGapToSeq(contigGaps, _max((*it).beginPos, (*it).endPos) - 1) - 2;
989 }
990 flag |= 0x0002;
991 if ((*mit).beginPos > (*mit).endPos)
992 flag |= 0x0020;
993 }
994 else
995 flag |= 0x0008; // mate is unmapped (actually we should check if the mate has no match at all)
996
997 signed char mateNo = getMateNo(store, readId);
998 if (mateNo == 0) flag |= 0x0040; // this read is the first in the pair
999 if (mateNo == 1) flag |= 0x0080; // this read is the second in the pair
1000
1001 if (readId < length(store.readStore))
1002 {
1003 TRead &read = store.readStore[readId];
1004 if (read.matePairId != TRead::INVALID_ID)
1005 flag |= 0x0001;
1006 }
1007
1008 // <qname>
1009 if (readId < length(store.readNameStore)) {
1010 typedef typename Iterator<CharString, Standard>::Type TCharStringIterator;
1011 for (TCharStringIterator it = begin(store.readNameStore[readId]); it != end(store.readNameStore[readId]); ++it) {
1012 if (*it == ' ' || *it == '\t' || *it == '\n' || *it == '\r')
1013 break;
1014 _streamPut(target, *it);
1015 }
1016 }
1017 _streamPut(target, '\t');
1018
1019 // <flag>
1020 _streamPutInt(target, flag);
1021 _streamPut(target, '\t');
1022
1023 // <rname>
1024 if ((*it).contigId < length(store.contigNameStore))
1025 _streamWrite(target, store.contigNameStore[(*it).contigId]);
1026 else
1027 _streamWrite(target, '.'); // No reference name given. Standard says field must not be empty but gives no "NULL" value.
1028 _streamPut(target, '\t');
1029
1030 // <pos>
1031 _streamPutInt(target, pos);
1032 _streamPut(target, '\t');
1033
1034 // <mapq>
1035 if (alignedId < length(store.alignQualityStore))
1036 _streamPutInt(target, store.alignQualityStore[alignedId].score);
1037 else
1038 _streamPutInt(target, 255);
1039 _streamPut(target, '\t');
1040
1041 // get read sequence
1042 if (readId < length(store.readSeqStore))
1043 {
1044 readSeq = store.readSeqStore[readId];
1045 if ((*it).beginPos <= (*it).endPos)
1046 {
1047 setBeginPosition(contigGaps, (*it).beginPos);
1048 setEndPosition(contigGaps, (*it).endPos);
1049 } else
1050 {
1051 setBeginPosition(contigGaps, (*it).endPos);
1052 setEndPosition(contigGaps, (*it).beginPos);
1053 reverseComplement(readSeq);
1054 }
1055 } else
1056 clear(readSeq);
1057
1058 // <cigar>
1059 TReadGaps readGaps(readSeq, (*it).gaps);
1060 typedef Gaps<TContigSeq, AnchorGaps<typename TContig::TGapAnchors> > TContigGaps2;
1061 // TContigGaps contigGaps2(store.contigStore[(*it).contigId].seq, store.contigStore[(*it).contigId].gaps);
1062 // if (i == 4)
1063 // printf("It's it!\n");
1064 // std::cerr << "read gaps: " << readGaps << std::endl;
1065 // std::cerr << "contig gaps:" << contigGaps2 << std::endl;
1066 getCigarString(cigar, contigGaps, readGaps);
1067
1068 _streamWrite(target, cigar);
1069 _streamPut(target, '\t');
1070
1071 // <mrnm>
1072 if ((mateIdx < length(store.alignedReadStore)))
1073 {
1074 if ((*it).contigId == (*mit).contigId)
1075 _streamWrite(target, '=');
1076 else
1077 if ((*mit).contigId < length(store.contigNameStore))
1078 _streamWrite(target, store.contigNameStore[(*mit).contigId]);
1079 } else
1080 _streamWrite(target, '*');
1081
1082 _streamPut(target, '\t');
1083
1084 // <mpos>
1085 _streamPutInt(target, (int)mpos);
1086 _streamPut(target, '\t');
1087
1088 // <isize>
1089 _streamPutInt(target, isize);
1090 _streamPut(target, '\t');
1091
1092 // <seq>
1093 _streamWrite(target, readSeq);
1094 _streamPut(target, '\t');
1095
1096 // <qual>
1097 TReadSeqIter it = begin(store.readSeqStore[readId], Standard());
1098 TReadSeqIter itEnd = end(store.readSeqStore[readId], Standard());
1099 for (; it != itEnd; ++it)
1100 _streamPut(target, (char)(getQualityValue(*it) + 33));
1101
1102 // <tags>
1103 if (alignedId < length(store.alignedReadTagStore) && !empty(store.alignedReadTagStore[alignedId]))
1104 {
1105 _streamPut(target, '\t');
1106 _streamWrite(target, store.alignedReadTagStore[alignedId]);
1107 }
1108
1109 _streamPut(target, '\n');
1110 }
1111
1112 }
1113
1114 //////////////////////////////////////////////////////////////////////////////
1115 // write
1116
1117 ///.Function.write.param.tag.type:Tag.File Format.tag.Sam
1118
1119 template<typename TFile, typename TSpec, typename TConfig>
write(TFile & target,FragmentStore<TSpec,TConfig> & store,Sam)1120 inline void write(TFile & target,
1121 FragmentStore<TSpec, TConfig> & store,
1122 Sam)
1123 {
1124 // write header
1125 _writeHeader(target, store, Sam());
1126
1127 // write aligments
1128 _writeAlignments(target, store, Sam());
1129 }
1130
1131 }// namespace SEQAN_NAMESPACE_MAIN
1132
1133 #endif //#ifndef SEQAN_HEADER_...
1134