1 // ==========================================================================
2 //                 SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2010, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 //     * Redistributions of source code must retain the above copyright
11 //       notice, this list of conditions and the following disclaimer.
12 //     * Redistributions in binary form must reproduce the above copyright
13 //       notice, this list of conditions and the following disclaimer in the
14 //       documentation and/or other materials provided with the distribution.
15 //     * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 //       its contributors may be used to endorse or promote products derived
17 //       from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: David Weese <david.weese@fu-berlin.de>
33 // ==========================================================================
34 
35 #include <iostream>
36 
37 #ifndef SEQAN_HEADER_STORE_IO_SAM_H
38 #define SEQAN_HEADER_STORE_IO_SAM_H
39 
40 namespace SEQAN_NAMESPACE_MAIN
41 {
42 
43 //////////////////////////////////////////////////////////////////////////////
44 // File tags
45 //////////////////////////////////////////////////////////////////////////////
46 
47 /**
48 .Tag.File Format.tag.Sam:
49 	Sam mapping file.
50 ..include:seqan/store.h
51 */
52 struct Sam_;
53 typedef Tag<Sam_> const Sam;
54 
55 
56 //////////////////////////////////////////////////////////////////////////////
57 // CIGAR struct
58 //////////////////////////////////////////////////////////////////////////////
59 
60 #if SEQAN_HAS_SAMTOOLS
61     struct FromBam_;
62     typedef Tag<FromBam_> FromBam;
63 #endif  // #if SEQAN_HAS_SAMTOOLS
64 
65     template <typename TOperation_ = char, typename TCount_ = unsigned>
66 	struct CigarElement
67 	{
68 		typedef TOperation_ TOperation;
69 		typedef TCount_		TCount;
70 
71 		TOperation			operation;
72 		TCount				count;
73 
CigarElementCigarElement74 		CigarElement() : operation(0), count(0) {}
75 
CigarElementCigarElement76 		CigarElement(TOperation o, TCount c):
77 			operation(o),
78 			count(c) {}
79 
80 #if SEQAN_HAS_SAMTOOLS
CigarElementCigarElement81         CigarElement(__uint32 bamCigarElement, FromBam const &)
82         {
83             SEQAN_ASSERT_LEQ(bamCigarElement & BAM_CIGAR_MASK, 8u);
84             operation = "MIDNSHP=X"[bamCigarElement & BAM_CIGAR_MASK];
85             count = bamCigarElement >> 4;
86         }
87 #endif  // #if SEQAN_HAS_SAMTOOLS
88 	};
89 
90 template <typename TOperation, typename TCount>
toBamCigarElement(CigarElement<TOperation,TCount> const & cigarElement)91 __uint32 toBamCigarElement(CigarElement<TOperation, TCount> const & cigarElement)
92 {
93     char operation = 0;
94     switch (cigarElement.operation) {
95         case 'X': operation += 1;
96         case '=': operation += 1;
97         case 'P': operation += 1;
98         case 'H': operation += 1;
99         case 'S': operation += 1;
100         case 'N': operation += 1;
101         case 'D': operation += 1;
102         case 'I': operation += 1;
103         case 'M': break;
104     }
105     return (cigarElement.count << 4) | operation;
106 }
107 
108 //____________________________________________________________________________
109 
110 template <
111 	typename TCigar,
112 	typename TGaps1,
113 	typename TGaps2,
114 	typename TThresh>
115 inline void
getCigarString(TCigar & cigar,TGaps1 & gaps1,TGaps2 & gaps2,TThresh splicedGapThresh)116 getCigarString(
117 	TCigar &cigar,
118 	TGaps1 &gaps1,
119 	TGaps2 &gaps2,
120 	TThresh splicedGapThresh)
121 {
122 	typename Iterator<TGaps1>::Type it1 = begin(gaps1);
123 	typename Iterator<TGaps2>::Type it2 = begin(gaps2);
124 	clear(cigar);
125 	char op, lastOp = ' ';
126 	unsigned numOps = 0;
127 
128 //	std::cout << gaps1 << std::endl;
129 //	std::cout << gaps2 << std::endl;
130 	for (; !atEnd(it1) && !atEnd(it2); goNext(it1), goNext(it2))
131 	{
132 		if (isGap(it1))
133 		{
134 			if (isGap(it2))
135 				op = 'P';
136 			else if (isClipped(it2))
137 				op = '?';
138 			else
139 				op = 'I';
140 		}
141 		else if (isClipped(it1))
142 		{
143 			op = '?';
144 		}
145 		else
146 		{
147 			if (isGap(it2))
148 				op = 'D';
149 			else if (isClipped(it2))
150 				op = 'S';
151 			else
152 				op = 'M';
153 		}
154 		if (lastOp != op)
155 		{
156 			if (lastOp == 'D' && numOps >= (unsigned)splicedGapThresh)
157 				lastOp = 'N';
158 			if (numOps > 0)
159 			{
160 				std::stringstream num;
161 				num << numOps;
162 				append(cigar, num.str());
163 				appendValue(cigar, lastOp);
164 			}
165 			numOps = 0;
166 			lastOp = op;
167 		}
168 		++numOps;
169 	}
170 	SEQAN_ASSERT_EQ(atEnd(it1), atEnd(it2));
171 	if (lastOp == 'D' && numOps >= (unsigned)splicedGapThresh)
172 		lastOp = 'N';
173 	if (numOps > 0)
174 	{
175 		std::stringstream num;
176 		num << numOps;
177 		append(cigar, num.str());
178 		appendValue(cigar, lastOp);
179 	}
180 }
181 
182 template <
183 	typename TCigar,
184 	typename TGaps1,
185 	typename TGaps2>
186 inline void
getCigarString(TCigar & cigar,TGaps1 & gaps1,TGaps2 & gaps2)187 getCigarString(
188 	TCigar &cigar,
189 	TGaps1 &gaps1,
190 	TGaps2 &gaps2)
191 {
192 	return getCigarString(cigar, gaps1, gaps2, 20);
193 }
194 
195 template <
196     typename TOperation,
197     typename TCount,
198 	typename TSpec,
199 	typename TGaps1,
200 	typename TGaps2,
201 	typename TThresh>
202 inline void
getCigarString(String<CigarElement<TOperation,TCount>,TSpec> & cigar,TGaps1 & gaps1,TGaps2 & gaps2,TThresh splicedGapThresh)203 getCigarString(
204         String<CigarElement<TOperation, TCount>, TSpec> &cigar,
205         TGaps1 &gaps1,
206         TGaps2 &gaps2,
207         TThresh splicedGapThresh)
208 {
209 	typename Iterator<TGaps1>::Type it1 = begin(gaps1);
210 	typename Iterator<TGaps2>::Type it2 = begin(gaps2);
211 	clear(cigar);
212 	char op, lastOp = ' ';
213 	unsigned numOps = 0;
214 
215 //	std::cout << gaps1 << std::endl;
216 //	std::cout << gaps2 << std::endl;
217 	for (; !atEnd(it1) && !atEnd(it2); goNext(it1), goNext(it2))
218 	{
219 		if (isGap(it1))
220 		{
221 			if (isGap(it2))
222 				op = 'P';
223 			else if (isClipped(it2))
224 				op = '?';
225 			else
226 				op = 'I';
227 		}
228 		else if (isClipped(it1))
229 		{
230 			op = '?';
231 		}
232 		else
233 		{
234 			if (isGap(it2))
235 				op = 'D';
236 			else if (isClipped(it2))
237 				op = 'S';
238 			else
239 				op = 'M';
240 		}
241 		if (lastOp != op)
242 		{
243 			if (lastOp == 'D' && numOps >= (unsigned)splicedGapThresh)
244 				lastOp = 'N';
245 			if (numOps > 0)
246 				appendValue(cigar, CigarElement<>(op, numOps));
247 			numOps = 0;
248 			lastOp = op;
249 		}
250 		++numOps;
251 	}
252 	SEQAN_ASSERT_EQ(atEnd(it1), atEnd(it2));
253 	if (lastOp == 'D' && numOps >= (unsigned)splicedGapThresh)
254 		lastOp = 'N';
255 	if (numOps > 0)
256         appendValue(cigar, CigarElement<>(op, numOps));
257 }
258 
259 //////////////////////////////////////////////////////////////////////////////
260 // _getClippedLength
261 
262     template <typename TCigarString, typename TNum>
_getClippedLength(TCigarString const & cigar,TNum & sum)263     inline void _getClippedLength(TCigarString const & cigar, TNum & sum)
264     {
265         typedef typename Iterator<TCigarString, Standard>::Type TCigarIter;
266 
267         TCigarIter it = begin(cigar, Standard());
268         TCigarIter itEnd = end(cigar, Standard());
269 
270         sum = 0;
271         for (; it != itEnd; ++it)
272             if (getValue(it).operation != 'S' && getValue(it).operation != 'H')
273                 sum += getValue(it).count;
274     }
275 
276 //////////////////////////////////////////////////////////////////////////////
277 // convert CIGAR to gaps
278 
279     template<typename TCigarString, typename TGaps>
cigarToGapAnchorRead(TCigarString const & cigar,TGaps & gaps)280     inline void cigarToGapAnchorRead(TCigarString const & cigar, TGaps & gaps)
281     {
282 		typename Iterator<TGaps>::Type it = begin(gaps);
283 		for (unsigned i = 0; i < length(cigar); ++i)
284 		{
285 			switch (cigar[i].operation)
286 			{
287 				case 'D':
288 				case 'N':
289 				case 'P':
290 					insertGaps(it, cigar[i].count);
291 				case 'I':
292 				case 'M':
293         case 'S':
294 					it += cigar[i].count;
295 			}
296 		}
297 	}
298 
299     template<typename TCigarString, typename TGaps>
cigarToGapAnchorContig(TCigarString const & cigar,TGaps & gaps)300     inline void cigarToGapAnchorContig(TCigarString const & cigar, TGaps & gaps)
301     {
302 		typename Iterator<TGaps>::Type it = begin(gaps);
303 		for (unsigned i = 0; i < length(cigar); ++i)
304 		{
305 			switch (cigar[i].operation)
306 			{
307 				case 'I':
308 				case 'P':
309 					insertGaps(it, cigar[i].count);
310 				case 'D':
311 				case 'M':
312 				case 'N':
313         case 'S':
314 					it += cigar[i].count;
315 			}
316 		}
317 	}
318 
319 
320 
321 //////////////////////////////////////////////////////////////////////////////
322 // Parsing functions
323 //////////////////////////////////////////////////////////////////////////////
324 
325 
326 //////////////////////////////////////////////////////////////////////////////
327 // _parseReadCigar
328 
329     template <typename TFile, typename TCigarString, typename TChar>
330     inline void
_parseReadCigar(TFile & file,TCigarString & cigar,TChar & c)331     _parseReadCigar(TFile & file, TCigarString & cigar, TChar & c)
332     {
333 		typedef typename Value<TCigarString>::Type	TCigarElement;
334 		typedef typename TCigarElement::TOperation	TOperation;
335 		typedef typename TCigarElement::TCount		TCount;
336 
337 		clear(cigar);
338 
339         // if the CIGAR is not set and '*'
340         if (c == '*')
341 		{
342             c = _streamGet(file);
343             return;
344         }
345 
346         while (!_streamEOF(file))
347 		{
348             TCount count = _parseReadNumber(file, c);
349 			if (c >= 'a' && c <= 'z')
350 				c = c + 'A' - 'a';
351             appendValue(cigar, TCigarElement(c, count));
352 
353             c = _streamGet(file);
354             if (c == ' ' || c == '\t' || c == '\n') break;
355         }
356     }
357 
358 //////////////////////////////////////////////////////////////////////////////
359 // _parseReadSamIdentifier
360 
361     template<typename TFile, typename TString, typename TChar>
362     inline void
_parseReadSamIdentifier(TFile & file,TString & str,TChar & c)363     _parseReadSamIdentifier(TFile & file, TString & str, TChar& c)
364     {
365         if (c == ' ' || c == '\t' || c == '\n') return;
366         appendValue(str, c);
367         while (!_streamEOF(file))
368 		{
369             c = _streamGet(file);
370             if (c == ' ' || c == '\t' || c == '\n') return;
371             appendValue(str, c);
372         }
373     }
374 
375 //////////////////////////////////////////////////////////////////////////////
376 // _parseIsDna
377 
378     template<typename TChar>
379     inline bool
_parseIsDna(TChar const & c)380     _parseIsDna(TChar const & c)
381     {
382         char x = TChar(Dna5(c));
383         return (c == x) || (c + 'A' - 'a' == x);
384     }
385 
386 //////////////////////////////////////////////////////////////////////////////
387 //_parseReadDnaSeq
388 
389     template<typename TFile, typename TString, typename TChar>
390     inline void
_parseReadDnaSeq(TFile & file,TString & str,TChar & c)391     _parseReadDnaSeq(TFile & file, TString & str, TChar & c)
392     {
393 		TChar first = c;
394 		if (!_streamEOF(file))
395 			c = _streamGet(file);
396 
397         if (!_parseIsDna(first))
398 			return;
399         appendValue(str, first, Generous());
400 
401         for (; !_streamEOF(file) && _parseIsDna(c); c = _streamGet(file))
402             appendValue(str, c, Generous());
403     }
404 
405 //////////////////////////////////////////////////////////////////////////////
406 // _parseIsPhredQual
407 
408     template <typename TChar>
409     inline bool
_parseIsPhredQual(TChar c)410     _parseIsPhredQual(TChar c)
411     {
412         return c >= '!' && c <= '~';
413     }
414 
415 //////////////////////////////////////////////////////////////////////////////
416 // _parseReadSeqQual
417 //
418 
419     template<typename TFile, typename TQualString, typename TChar>
420     inline void
_parseReadSeqQual(TFile & file,TQualString & str,TChar & c)421     _parseReadSeqQual(TFile & file, TQualString & str, TChar & c)
422     {
423         typedef typename Size<TQualString>::Type				TSize;
424         typedef typename Iterator<TQualString, Standard>::Type	TIter;
425 
426         TIter itBegin = begin(str, Standard());
427         TSize rest = length(str);
428 
429         int q = 0;
430         for (TIter it = itBegin; rest != 0 && _parseIsPhredQual(c); --rest, ++it)
431         {
432             q = c - '!';
433 			if (!_streamEOF(file))
434 				c = _streamGet(file);
435 			else
436 				if (rest > 1)
437 					rest = 1;
438 
439 			if (q == '*' - '!' && !_parseIsPhredQual(c) && it == itBegin)
440 				return;
441 
442             assignQualityValue(*it, q);
443         }
444     }
445 
446 //////////////////////////////////////////////////////////////////////////////
447 // _parseReadCharsUntilEndOfLine
448 //
449 // Reads all symbols till the next '\n' and writes them in the CharString str
450 // the c is the first character after the '\n'.
451 
452     template<typename TFile, typename TChar>
453     inline void
_parseReadCharsUntilEndOfLine(TFile & file,String<char> & str,TChar & c)454     _parseReadCharsUntilEndOfLine(TFile & file, String<char> & str, TChar& c)
455     {
456         // read all chars till '\n'
457         while (c != '\n')
458         {
459             appendValue(str, c, Generous());
460 			if (_streamEOF(file)) return;
461 	        c = _streamGet(file);
462         }
463 
464         // read the first char after the '\n'
465 		if (!_streamEOF(file))
466 	        c = _streamGet(file);
467     }
468 
469 //////////////////////////////////////////////////////////////////////////////
470 // appendAlignment
471 
472     template<typename TSpec, typename TConfig, typename TId, typename TPos, typename TGaps>
473     inline typename Size<typename FragmentStore<TSpec, TConfig>::TAlignedReadStore>::Type
appendAlignment(FragmentStore<TSpec,TConfig> & fragStore,TId readId,TId contigId,TPos beginPos,TPos endPos,TGaps const & gaps)474 	appendAlignment(
475 		FragmentStore<TSpec, TConfig> & fragStore,
476 		TId readId,
477 		TId contigId,
478 		TPos beginPos,
479 		TPos endPos,
480 		TGaps const & gaps)
481 	{
482         typedef FragmentStore<TSpec, TConfig> TFragmentStore;
483         typedef typename Value<typename TFragmentStore::TAlignedReadStore>::Type TAlignedElement;
484 
485         TId id = length(fragStore.alignedReadStore);
486         TAlignedElement alignedElem = TAlignedElement(id, readId, contigId, beginPos, endPos, gaps);
487         appendValue(fragStore.alignedReadStore, alignedElem);
488 
489 		return id;
490     }
491 
492 
493 
494 //////////////////////////////////////////////////////////////////////////////
495 // read functions for Sam
496 //////////////////////////////////////////////////////////////////////////////
497 
498 
499 //////////////////////////////////////////////////////////////////////////////
500 // _generatePairMatchIds
501 //
502 	template <typename TPos, typename TId>
503 	struct MatchMateInfo_
504 	{
505 		TId		readId;
506 		TId		contigId;
507 		TId		pairMatchId;
508 		TPos	beginPos;
509 	};
510 
511     struct MatchMateInfoLess_
512 	{
513         template <typename TMInfo>
514         inline bool
operatorMatchMateInfoLess_515         operator() (TMInfo const &a, TMInfo const &b) const
516 		{
517             return (a.contigId < b.contigId) || (a.contigId == b.contigId && a.beginPos < b.beginPos);
518         }
519     };
520 
521     template<typename TSpec, typename TConfig, typename TMatchMateInfos>
522     inline void
_generatePairMatchIds(FragmentStore<TSpec,TConfig> & fragStore,TMatchMateInfos & matchMateInfos)523 	_generatePairMatchIds (
524 		FragmentStore<TSpec, TConfig> & fragStore,
525 		TMatchMateInfos & matchMateInfos)
526     {
527         typedef FragmentStore<TSpec, TConfig> TFragmentStore;
528 
529         typedef typename TFragmentStore::TReadStore			TReadStore;
530         typedef typename TFragmentStore::TAlignedReadStore	TAlignedReadStore;
531         typedef typename TFragmentStore::TContigPos			TContigPos;
532 
533         typedef typename Value<TReadStore>::Type						TRead;
534         typedef typename Value<TAlignedReadStore>::Type					TAlignedRead;
535         typedef typename Iterator<TAlignedReadStore, Standard>::Type	TIter;
536 		typedef typename Iterator<TMatchMateInfos, Standard>::Type		TMIter;
537         typedef typename Id<TFragmentStore>::Type						TId;
538 
539         TIter it = begin(fragStore.alignedReadStore, Standard());
540 		TIter itEnd = end(fragStore.alignedReadStore, Standard());
541 		TMIter mit = begin(matchMateInfos, Standard());
542 		TMIter mitEnd = end(matchMateInfos, Standard());
543 
544 		if (it == itEnd || mit == mitEnd) return;
545 
546         // sort the aligned read store by: begin position, contig name
547         sortAlignedReads(fragStore.alignedReadStore, SortBeginPos());
548         sortAlignedReads(fragStore.alignedReadStore, SortContigId());
549 		std::sort(mit, mitEnd, MatchMateInfoLess_());
550 
551 		TMIter mitNext = mit;
552 		while (mitNext != mitEnd && (*mit).beginPos == (*mitNext).beginPos)
553 			++mitNext;
554 
555 		TContigPos pos = _min((*it).beginPos, (*it).endPos);
556 		typename Size<TReadStore>::Type readStoreLength = length(fragStore.readStore);
557 
558 		while (mit != mitEnd)
559 		{
560 			int cmp = 0;
561 			if ((*it).contigId < (*mit).contigId) cmp = -1;
562 			else if ((*it).contigId > (*mit).contigId) cmp = 1;
563 			else if (pos < (*mit).beginPos) cmp = -1;
564 			else if (pos > (*mit).beginPos) cmp = 1;
565 
566 			if (cmp == 1)
567 			{
568 				mit = mitNext;
569 				while (mitNext != mitEnd && (*mit).beginPos == (*mitNext).beginPos)
570 					++mitNext;
571 				continue;
572 			}
573 			if (cmp == 0)
574 			{
575 				for (TMIter m = mit; m != mitNext; ++m)
576 					if ((*it).readId != (*m).readId && (*it).readId < readStoreLength && (*m).readId < readStoreLength)		// correct position found
577 						if (fragStore.readStore[(*it).readId].matePairId == fragStore.readStore[(*m).readId].matePairId)	// correct mate found
578 							(*it).pairMatchId = (*m).pairMatchId;															// link mate
579 			}
580 			if (++it == itEnd) break;
581 			pos = _min((*it).beginPos, (*it).endPos);
582 		}
583     }
584 
585 //////////////////////////////////////////////////////////////////////////////
586 // read
587 
588 ///.Function.read.param.tag.type:Tag.File Format.tag.Sam
589 
590     template<typename TFile, typename TSpec, typename TConfig>
591     inline void
read(TFile & file,FragmentStore<TSpec,TConfig> & fragStore,Sam)592     read(
593 		TFile & file,
594 		FragmentStore<TSpec, TConfig> & fragStore,
595 		Sam)
596     {
597         typedef Value<FILE>::Type TValue;
598         typedef FragmentStore<TSpec, TConfig> TFragmentStore;
599 		typedef typename TFragmentStore::TContigPos TContigPos;
600         typedef typename Id<TFragmentStore>::Type TId;
601 
602         // data structure to temporarily store the gaps that need to be inserted in the contig sequences
603         typedef MatchMateInfo_<TContigPos, TId> TMatchMateInfo;
604         typedef String<TMatchMateInfo> TMatchMateInfos;
605         typedef StringSet<String<typename TFragmentStore::TContigGapAnchor> > TContigAnchorGaps;
606 
607         // data structure to temporarily store information about match mates
608         TMatchMateInfos matchMateInfos;
609 		TContigAnchorGaps contigAnchorGaps;
610 
611         if (_streamEOF(file)) return;
612 
613 		// get first character from the stream
614         char c = _streamGet(file);
615 
616         // Read in header section
617         _readHeader(file, fragStore, c, Sam());
618 
619         // Read in alignments section
620         _readAlignments(file, fragStore, contigAnchorGaps, matchMateInfos, c, Sam());
621 
622         // set the match mate IDs using the information stored in matchMateInfos
623         _generatePairMatchIds(fragStore, matchMateInfos);
624 
625 		convertPairWiseToGlobalAlignment(fragStore, contigAnchorGaps);
626     }
627 
628 //////////////////////////////////////////////////////////////////////////////
629 // _readHeader
630 
631     template<typename TFile, typename TSpec, typename TConfig, typename TChar>
632     inline void
_readHeader(TFile & file,FragmentStore<TSpec,TConfig> &,TChar & c,Sam)633     _readHeader (
634 		TFile & file,
635 		FragmentStore<TSpec, TConfig> &,
636 		TChar & c,
637 		Sam)
638     {
639         // skip header for now
640         while (c == '@')
641             _parseSkipLine(file, c);
642     }
643 
644 //////////////////////////////////////////////////////////////////////////////
645 // _readAlignments
646 //
647 // reads in alignement sections from a Sam file
648 
649     template<typename TFile, typename TSpec, typename TConfig, typename TContigAnchorGaps, typename TMatchMateInfos, typename TChar>
650     inline void
_readAlignments(TFile & file,FragmentStore<TSpec,TConfig> & fragStore,TContigAnchorGaps & contigAnchorGaps,TMatchMateInfos & matchMateInfos,TChar & c,Sam)651     _readAlignments (
652 		TFile & file,
653         FragmentStore<TSpec, TConfig> & fragStore,
654         TContigAnchorGaps & contigAnchorGaps,
655         TMatchMateInfos & matchMateInfos,
656         TChar & c,
657         Sam)
658     {
659         // create dummy entries in Sam specific aligned read quality store and aligned read tag store
660         // is needed so the ID in the aligned store can be use to access the other stores
661         // even if there exists previous entries without
662 		typedef FragmentStore<TSpec, TConfig> TFragmentStore;
663 		typedef typename TFragmentStore::TAlignQualityStore TAlignQualityStore;
664 		typedef typename TFragmentStore::TNameStore TNameStore;
665 		typedef typename Value<TAlignQualityStore>::Type TAlignQuality;
666 
667 		TAlignQuality q;
668 		q.score = maxValue(q.score);
669         int diff = length(fragStore.alignedReadStore) - length(fragStore.alignQualityStore);
670         for(int i = 0; i < diff; ++i)
671             appendValue(fragStore.alignQualityStore, q, Generous());
672 
673         diff = length(fragStore.alignedReadStore) - length(fragStore.alignedReadTagStore);
674         for(int i = 0; i < diff; ++i)
675             appendValue(fragStore.alignedReadTagStore, "", Generous());
676 
677         // read in alignments
678 		Nothing contextSAM;
679 		refresh(fragStore.contigNameStoreCache);
680 		refresh(fragStore.readNameStoreCache);
681 
682         while (!_streamEOF(file))
683             _readOneAlignment(file, fragStore, contigAnchorGaps, matchMateInfos, c, Sam(), contextSAM);
684     }
685 
686 
687 //////////////////////////////////////////////////////////////////////////////
688 // _readOneAlignment
689 //
690 // reads in one alignement section from a Sam file
691 
692     template<typename TFile, typename TSpec, typename TConfig, typename TContigAnchorGaps, typename TMatchMateInfos, typename TChar, typename TContextSAM>
693     inline void
_readOneAlignment(TFile & file,FragmentStore<TSpec,TConfig> & fragStore,TContigAnchorGaps & contigAnchorGaps,TMatchMateInfos & matchMateInfos,TChar & c,Sam,TContextSAM & contextSAM)694     _readOneAlignment (
695 		TFile & file,
696 		FragmentStore<TSpec, TConfig> & fragStore,
697 		TContigAnchorGaps & contigAnchorGaps,
698 		TMatchMateInfos & matchMateInfos,
699 		TChar & c,
700 		Sam,
701 		TContextSAM & contextSAM)
702     {
703         // Basic types
704         typedef FragmentStore<TSpec, TConfig>										TFragmentStore;
705         typedef typename Id<TFragmentStore>::Type									TId;
706         typedef typename Size<TFragmentStore>::Type									TSize;
707 
708         // All fragment store element types
709         typedef typename Value<typename TFragmentStore::TContigStore>::Type			TContigElement;
710         typedef typename Value<typename TFragmentStore::TLibraryStore>::Type		TLibraryStoreElement;
711         typedef typename Value<typename TFragmentStore::TMatePairStore>::Type		TMatePairElement;
712         typedef typename Value<typename TFragmentStore::TReadStore>::Type			TReadStoreElement;
713         typedef typename Value<typename TFragmentStore::TAlignedReadStore>::Type	TAlignedElement;
714         typedef typename Value<typename TFragmentStore::TAlignQualityStore>::Type	TAlignQualityElement;
715         typedef typename TAlignedElement::TGapAnchors								TReadGapAnchors;
716 
717         // Type for sequence in readstore
718         typedef typename TFragmentStore::TReadSeq TReadSeq2;
719 
720         // Type for gap anchor
721         typedef typename TFragmentStore::TContigPos									TContigPos;
722 		typedef Gaps<TReadSeq2, AnchorGaps<TReadGapAnchors> >						TReadGaps;
723 		typedef Gaps<Nothing, AnchorGaps<typename Value<TContigAnchorGaps>::Type> >	TContigGapsPW;
724 
725         // Type to temporarily store information about match mates
726         typedef typename Value<TMatchMateInfos>::Type								TMatchMateInfo;
727 
728         // read fields of alignments line
729         _parseSkipWhitespace(file, c);
730 
731 		// Read the query name.  The letters until the first
732 		// whitespace will be read into qname.  Then, we skip until we
733 		// hit the first tab character.
734         String<char> qname;
735         _parseReadSamIdentifier(file, qname, c);
736         _parseSkipUntilChar(file, '\t', c);
737 
738 		// read the flag
739         int flag;
740         flag = _parseReadNumber(file, c);
741         _parseSkipWhitespace(file, c);
742 		bool reverse = (flag & (1 << 4)) == (1 << 4);
743 
744 		// Read reference name.  Same behaviour as for query name:  Read up to
745         // the first whitespace character and skip to next tab char.
746         String<char> rname;
747         _parseReadSamIdentifier(file, rname, c);
748         _parseSkipUntilChar(file, '\t', c);
749 
750 		// read begin position
751         TContigPos beginPos;
752         beginPos = _parseReadNumber(file, c);
753         --beginPos; // Sam stores positions starting at 1 the fragment store starting at 0
754         _parseSkipWhitespace(file, c);
755 
756         // read map quality
757         TAlignQualityElement mapQ;
758         mapQ.score = _parseReadNumber(file, c);
759         _parseSkipWhitespace(file, c);
760 
761 		// read CIGAR
762         String<CigarElement<> > cigar;
763         _parseReadCigar(file, cigar, c);
764         _parseSkipWhitespace(file, c);
765 
766         // calculate the end position
767         TContigPos endPos;
768         _getClippedLength(cigar, endPos);
769         endPos = beginPos + endPos;
770         // if the read is on the antisense strand switch begin and end position
771         if (reverse)
772 		{
773             TContigPos temp = beginPos;
774             beginPos = endPos;
775             endPos = temp;
776         }
777 
778 		// generate gap anchor string for the read
779         TReadGapAnchors readGapAnchors;
780 
781         // read mate reference name
782         String<char> mrnm;
783         _parseReadSamIdentifier(file, mrnm, c);
784         _parseSkipWhitespace(file, c);
785 
786 		// read mate position
787         TContigPos mPos;
788         mPos = _parseReadNumber(file, c);
789         --mPos; // Sam stores positions starting at 1 the fragment store starting at 0
790         _parseSkipWhitespace(file, c);
791 
792 		// read iSize
793         _parseReadNumber(file, c);
794         _parseSkipWhitespace(file, c);
795 
796 		// read in sequence
797         TReadSeq2 readSeq;
798         _parseReadDnaSeq(file, readSeq, c);
799         SEQAN_ASSERT_GT(length(readSeq), 0u);
800 		if (reverse)  // TODO(holtgrew): Why not after parsing quality? ib. for BAM...
801 			reverseComplement(readSeq);
802         _parseSkipWhitespace(file, c);
803 
804 		// and associated qualities
805         _parseReadSeqQual(file, readSeq, c);
806 
807 		// insert alignment gaps
808 		TReadGaps readGaps(readSeq, readGapAnchors);
809         cigarToGapAnchorRead(cigar, readGaps);
810 
811         // read in Sam tags
812         String<char> tags;
813         _parseSkipSpace(file, c);
814         _parseReadCharsUntilEndOfLine(file, tags, c);
815 
816 		if (empty(qname) || empty(rname))
817 			return;
818 
819         // check if read sequence is already in the store.
820         // if so get the ID, otherwise create new entries in the
821         // read, read name and mate pair store
822 
823         TId readId = 0;
824         _storeAppendRead(fragStore, readId, qname, readSeq, flag, contextSAM);
825 
826         // check if the contig is already in the store
827         // get its ID or create a new one otherwise
828         TId contigId = 0;
829         _storeAppendContig(fragStore, contigId, rname);
830 
831 		if (empty(cigar)) return;
832 
833         // create a new entry in the aligned read store
834         TId pairMatchId = appendAlignment(fragStore, readId, contigId, beginPos, endPos, readGapAnchors);
835 		resize(contigAnchorGaps, length(fragStore.alignedReadStore), Generous());
836 		TContigGapsPW contigGaps(back(contigAnchorGaps));
837         cigarToGapAnchorContig(cigar, contigGaps);
838 
839 		// create entries in Sam specific stores
840         appendValue(fragStore.alignQualityStore, mapQ, Generous());
841         appendValue(fragStore.alignedReadTagStore, tags, Generous());
842 
843         // store additional data about match mate temporarily
844         // used in the end of the read function to generate match mate IDs
845 		TId mcontigId = contigId;
846         if (mrnm != "*")
847 		{
848 			if (mrnm != "=")
849 				_storeAppendContig(fragStore, mcontigId, mrnm);
850 
851 			if (flag & 0x40)	// store mate info only for the first read in the pair
852 			{
853 				TMatchMateInfo matchMateInfo = {readId, mcontigId, pairMatchId, mPos};
854 				appendValue(matchMateInfos, matchMateInfo);
855 				back(fragStore.alignedReadStore).pairMatchId = pairMatchId;
856 			}
857 		}
858     }
859 
860 
861 //////////////////////////////////////////////////////////////////////////////
862 // write functions for Sam
863 //////////////////////////////////////////////////////////////////////////////
864 
865 //////////////////////////////////////////////////////////////////////////////
866 // _writeAlignments
867 
868     template<typename TFile, typename TSpec, typename TConfig>
_writeHeader(TFile & target,FragmentStore<TSpec,TConfig> & store,Sam)869 	inline void _writeHeader(TFile & target,
870                                  FragmentStore<TSpec, TConfig> & store,
871                                  Sam)
872     {
873 		typedef FragmentStore<TSpec, TConfig>							TFragmentStore;
874 		typedef typename TFragmentStore::TLibraryStore					TLibraryStore;
875 		typedef typename TFragmentStore::TContigStore					TContigStore;
876 		typedef typename TFragmentStore::TNameStore						TNameStore;
877 
878         typedef typename Value<TContigStore>::Type						TContig;
879         typedef typename Iterator<TLibraryStore, Standard>::Type		TLibraryIter;
880         typedef typename Iterator<TContigStore, Standard>::Type			TContigIter;
881         typedef typename Iterator<TNameStore, Standard>::Type			TContigNameIter;
882         typedef typename Id<TContig>::Type								TId;
883 
884         TContigIter it = begin(store.contigStore, Standard());
885         TContigIter itEnd = end(store.contigStore, Standard());
886 		TContigNameIter nit = begin(store.contigNameStore, Standard());
887 		TContigNameIter nitEnd = end(store.contigNameStore, Standard());
888 
889 		_streamWrite(target, "@HD\tVN:1.0\n");
890         for(; it != itEnd; ++it)
891 		{
892 			_streamWrite(target, "@SQ\tSN:");
893 			if (nit != nitEnd)
894 			{
895 				_streamWrite(target, *nit);
896 				++nit;
897 			}
898 			_streamWrite(target, "\tLN:");
899 			_streamPutInt(target, length((*it).seq));
900             _streamPut(target, '\n');
901 		}
902 
903 		TLibraryIter lit = begin(store.libraryStore, Standard());
904 		TLibraryIter litEnd = end(store.libraryStore, Standard());
905         for(TId id = 0; lit != litEnd; ++lit, ++id)
906 		{
907 			_streamWrite(target, "@RG\tID:");
908 			_streamPutInt(target, id + 1);
909 			_streamWrite(target, "\tLB:");
910 			_streamWrite(target, store.libraryNameStore[id]);
911 			_streamWrite(target, "\tPI:");
912 			_streamPutInt(target, (int)/*std::round*/(store.libraryStore[id].mean));
913 			_streamWrite(target, "\tSM:none");	// sample name needs to be included into fragment store
914             _streamPut(target, '\n');
915 		}
916 		_streamWrite(target, "@PG\tID:SeqAn\n");
917 	}
918 
919 
920 //////////////////////////////////////////////////////////////////////////////
921 // _writeAlignments
922 
923     template<typename TFile, typename TSpec, typename TConfig>
_writeAlignments(TFile & target,FragmentStore<TSpec,TConfig> & store,Sam)924 	inline void _writeAlignments(TFile & target,
925                                  FragmentStore<TSpec, TConfig> & store,
926                                  Sam)
927     {
928 		typedef FragmentStore<TSpec, TConfig>							TFragmentStore;
929 
930 		typedef typename TFragmentStore::TReadStore						TReadStore;
931 		typedef typename TFragmentStore::TReadSeqStore					TReadSeqStore;
932 		typedef typename TFragmentStore::TAlignedReadStore				TAlignedReadStore;
933 		typedef typename TFragmentStore::TContigStore					TContigStore;
934 		typedef typename TFragmentStore::TReadSeq						TReadSeq;
935 
936         typedef typename Value<TReadStore>::Type						TRead;
937         typedef typename Value<TReadSeqStore>::Type						TReadSeqStored;
938         typedef typename Value<TContigStore>::Type						TContig;
939         typedef typename Value<TAlignedReadStore>::Type					TAlignedRead;
940 
941 		typedef typename TContig::TContigSeq							TContigSeq;
942         typedef typename Iterator<TAlignedReadStore, Standard>::Type	TAlignIter;
943         typedef typename Iterator<TReadSeqStored, Standard>::Type		TReadSeqIter;
944         typedef typename Id<TAlignedRead>::Type							TId;
945 
946 		typedef Gaps<TReadSeq, AnchorGaps<typename TAlignedRead::TGapAnchors> >	TReadGaps;
947 		typedef Gaps<Nothing, AnchorGaps<typename TContig::TGapAnchors> >	TContigGaps;
948 
949 		String<int> mateIndex;	// store outer library size for each pair match (indexed by pairMatchId)
950 		calculateMateIndices(mateIndex, store);
951 
952         TAlignIter it = begin(store.alignedReadStore, Standard());
953         TAlignIter itEnd = end(store.alignedReadStore, Standard());
954 		TAlignIter mit = it;
955 		CharString cigar;
956 		TReadSeq readSeq;
957 
958 //        int i = 0;
959         for(; it != itEnd; ++it)
960 		{
961 //            std::cerr << "alignment " << i++ << std::endl;
962             TId alignedId = (*it).id;
963 			TId readId = (*it).readId;
964 			TId mateIdx = TRead::INVALID_ID;
965 
966 			if ((*it).pairMatchId != TRead::INVALID_ID)
967 				mateIdx = mateIndex[2*(*it).pairMatchId + getMateNo(store, (*it).readId)];
968 
969 			TContigGaps	contigGaps(/*store.contigStore[(*it).contigId].seq, */store.contigStore[(*it).contigId].gaps);
970 			__int64 pos = positionGapToSeq(contigGaps, _min((*it).beginPos, (*it).endPos)) + 1;
971 			__int64 mpos = 0;
972 			int isize = 0;
973 			unsigned short flag = 0;
974 
975 			if ((*it).beginPos > (*it).endPos)
976 				flag |= 0x0010;
977 
978 			// calculate flags, mpos, isize
979 			if (mateIdx < length(store.alignedReadStore))
980 			{
981 				mit = begin(store.alignedReadStore, Standard()) + mateIdx;
982 				if ((*it).contigId == (*mit).contigId)
983 				{
984 					mpos = positionGapToSeq(contigGaps, _min((*mit).beginPos, (*mit).endPos)) + 1;
985 					if ((*it).beginPos < (*mit).beginPos)
986 						isize = positionGapToSeq(contigGaps, _max((*mit).beginPos, (*mit).endPos) - 1) + 2 - pos;
987 					else
988 						isize = mpos - positionGapToSeq(contigGaps, _max((*it).beginPos, (*it).endPos) - 1) - 2;
989 				}
990 				flag |= 0x0002;
991 				if ((*mit).beginPos > (*mit).endPos)
992 					flag |= 0x0020;
993 			}
994 			else
995 				flag |= 0x0008;					// mate is unmapped (actually we should check if the mate has no match at all)
996 
997 			signed char mateNo = getMateNo(store, readId);
998 			if (mateNo == 0) flag |= 0x0040;	// this read is the first in the pair
999 			if (mateNo == 1) flag |= 0x0080;	// this read is the second in the pair
1000 
1001 			if (readId < length(store.readStore))
1002 			{
1003 				TRead &read = store.readStore[readId];
1004 				if (read.matePairId != TRead::INVALID_ID)
1005 					flag |= 0x0001;
1006 			}
1007 
1008 			// <qname>
1009 			if (readId < length(store.readNameStore)) {
1010                 typedef typename Iterator<CharString, Standard>::Type TCharStringIterator;
1011                 for (TCharStringIterator it = begin(store.readNameStore[readId]); it != end(store.readNameStore[readId]); ++it) {
1012                     if (*it == ' ' || *it == '\t' || *it == '\n' || *it == '\r')
1013                         break;
1014                     _streamPut(target, *it);
1015                 }
1016             }
1017             _streamPut(target, '\t');
1018 
1019             // <flag>
1020             _streamPutInt(target, flag);
1021             _streamPut(target, '\t');
1022 
1023 			// <rname>
1024 			if ((*it).contigId < length(store.contigNameStore))
1025 				_streamWrite(target, store.contigNameStore[(*it).contigId]);
1026             else
1027                 _streamWrite(target, '.');  // No reference name given.  Standard says field must not be empty but gives no "NULL" value.
1028             _streamPut(target, '\t');
1029 
1030 			// <pos>
1031             _streamPutInt(target, pos);
1032             _streamPut(target, '\t');
1033 
1034 			// <mapq>
1035 			if (alignedId < length(store.alignQualityStore))
1036 				_streamPutInt(target, store.alignQualityStore[alignedId].score);
1037             else
1038                 _streamPutInt(target, 255);
1039             _streamPut(target, '\t');
1040 
1041 			// get read sequence
1042 			if (readId < length(store.readSeqStore))
1043 			{
1044 				readSeq = store.readSeqStore[readId];
1045 				if ((*it).beginPos <= (*it).endPos)
1046 				{
1047 					setBeginPosition(contigGaps, (*it).beginPos);
1048 					setEndPosition(contigGaps, (*it).endPos);
1049 				} else
1050 				{
1051 					setBeginPosition(contigGaps, (*it).endPos);
1052 					setEndPosition(contigGaps, (*it).beginPos);
1053 					reverseComplement(readSeq);
1054 				}
1055 			} else
1056 				clear(readSeq);
1057 
1058             // <cigar>
1059 			TReadGaps readGaps(readSeq, (*it).gaps);
1060             typedef Gaps<TContigSeq, AnchorGaps<typename TContig::TGapAnchors> >	TContigGaps2;
1061 			// TContigGaps	contigGaps2(store.contigStore[(*it).contigId].seq, store.contigStore[(*it).contigId].gaps);
1062             // if (i == 4)
1063             //     printf("It's it!\n");
1064             // std::cerr << "read gaps:  " << readGaps << std::endl;
1065             // std::cerr << "contig gaps:" << contigGaps2 << std::endl;
1066 			getCigarString(cigar, contigGaps, readGaps);
1067 
1068 			_streamWrite(target, cigar);
1069             _streamPut(target, '\t');
1070 
1071             // <mrnm>
1072 			if ((mateIdx < length(store.alignedReadStore)))
1073 			{
1074 				if ((*it).contigId == (*mit).contigId)
1075 					_streamWrite(target, '=');
1076 				else
1077 					if ((*mit).contigId < length(store.contigNameStore))
1078 						_streamWrite(target, store.contigNameStore[(*mit).contigId]);
1079 			} else
1080 				_streamWrite(target, '*');
1081 
1082             _streamPut(target, '\t');
1083 
1084             // <mpos>
1085 			_streamPutInt(target, (int)mpos);
1086             _streamPut(target, '\t');
1087 
1088             // <isize>
1089 			_streamPutInt(target, isize);
1090             _streamPut(target, '\t');
1091 
1092             // <seq>
1093 			_streamWrite(target, readSeq);
1094             _streamPut(target, '\t');
1095 
1096             // <qual>
1097 			TReadSeqIter it = begin(store.readSeqStore[readId], Standard());
1098 			TReadSeqIter itEnd = end(store.readSeqStore[readId], Standard());
1099 			for (; it != itEnd; ++it)
1100 				_streamPut(target, (char)(getQualityValue(*it) + 33));
1101 
1102 			// <tags>
1103 			if (alignedId < length(store.alignedReadTagStore) && !empty(store.alignedReadTagStore[alignedId]))
1104 			{
1105 				_streamPut(target, '\t');
1106 				_streamWrite(target, store.alignedReadTagStore[alignedId]);
1107 			}
1108 
1109             _streamPut(target, '\n');
1110         }
1111 
1112     }
1113 
1114 //////////////////////////////////////////////////////////////////////////////
1115 // write
1116 
1117 ///.Function.write.param.tag.type:Tag.File Format.tag.Sam
1118 
1119     template<typename TFile, typename TSpec, typename TConfig>
write(TFile & target,FragmentStore<TSpec,TConfig> & store,Sam)1120     inline void write(TFile & target,
1121                       FragmentStore<TSpec, TConfig> & store,
1122                       Sam)
1123     {
1124         // write header
1125 		_writeHeader(target, store, Sam());
1126 
1127         // write aligments
1128         _writeAlignments(target, store, Sam());
1129     }
1130 
1131 }// namespace SEQAN_NAMESPACE_MAIN
1132 
1133 #endif //#ifndef SEQAN_HEADER_...
1134