1 // ==========================================================================
2 //                 SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2015, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 //     * Redistributions of source code must retain the above copyright
11 //       notice, this list of conditions and the following disclaimer.
12 //     * Redistributions in binary form must reproduce the above copyright
13 //       notice, this list of conditions and the following disclaimer in the
14 //       documentation and/or other materials provided with the distribution.
15 //     * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 //       its contributors may be used to endorse or promote products derived
17 //       from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: Tobias Rausch <rausch@embl.de>
33 // ==========================================================================
34 
35 #ifndef SEQAN_HEADER_CONSENSUS_BASE_H
36 #define SEQAN_HEADER_CONSENSUS_BASE_H
37 
38 namespace SEQAN_NAMESPACE_MAIN
39 {
40 
41 
42 //////////////////////////////////////////////////////////////////////////////
43 // Segment Match Generation tag
44 //////////////////////////////////////////////////////////////////////////////
45 
46 // TODO(holtgrew): This apparently belongs into graph_msa?
47 
48 struct OverlapLibrary_;
49 typedef Tag<OverlapLibrary_> const OverlapLibrary;
50 
51 
52 
53 //////////////////////////////////////////////////////////////////////////////
54 // Consensus tag
55 //////////////////////////////////////////////////////////////////////////////
56 
57 /*
58  * @defgroup ConsensusCallingTags Consensus Calling Tags
59  * @brief Tags for consensus calling.
60  *
61  * @tag ConsensusCallingTags#MajorityVote
62  * @headerfile <seqan/consensus.h>
63  * @brief Consensus based on the most common character.
64  *
65  * @tag ConsensusCallingTags#Bayesian
66  * @headerfile <seqan/consensus.h>
67  * @brief Consensus based on a bayesian probability.
68  */
69 
70 
71 struct MajorityVote_;
72 typedef Tag<MajorityVote_> const MajorityVote;
73 
74 struct Bayesian_;
75 typedef Tag<Bayesian_> const Bayesian;
76 
77 
78 
79 //////////////////////////////////////////////////////////////////////////////
80 // Read alignment and Consensus Generation
81 //////////////////////////////////////////////////////////////////////////////
82 
83 
84 //////////////////////////////////////////////////////////////////////////////////
85 
86 struct ConsensusOptions {
87 public:
88     // Method
89     // 0: graph-based multiple sequence alignment
90     // 1: realign
91     int method;
92 
93     // ReAlign Method
94     // 0: Needleman-Wunsch
95     // 1: Gotoh
96     int rmethod;
97 
98     // Bandwidth of overlap alignment
99     int bandwidth;
100 
101     // Number of computed overlaps per read (at the beginning and end of a read)
102     int overlaps;
103 
104     // Minimum match length of a computed overlap
105     int matchlength;
106 
107     // Minimum quality (in percent identity) of a computed overlap
108     int quality;
109 
110     // Window size, only relevant for insert sequencing
111     // If window == 0, no insert sequencing is assumed
112     int window;
113 
114     // Output
115     // 0: seqan style
116     // 1: afg output format
117     // 2: frg output format
118     // 3: cgb output format
119     // 4: Sam output format
120     int output;
121 
122     // Multi-read alignment
123     bool noalign;
124 
125     // Offset all reads, so the first read starts at position 0
126     bool moveToFront;
127 
128     // Include reference genome
129     bool include;
130 
131     // Scoring object for overlap alignments
132     Score<int> sc;
133 
134     // Various input and output files
135     std::string readsfile;                // File of reads in FASTA format
136     std::string afgfile;                // AMOS afg file input
137     std::string samfile;                // Sam file input
138     std::string contigsfile;            // FASTA reference file for Sam input
139     std::string outfile;                // Output file name
140 
141     // Initialization
ConsensusOptionsConsensusOptions142     ConsensusOptions() :
143         method(0), rmethod(0), bandwidth(0), overlaps(0), matchlength(0), quality(0),
144         window(0), output(0), noalign(false), moveToFront(0), include(0), sc(2, -6, -4, -9)
145     {}
146 };
147 
148 
149 //////////////////////////////////////////////////////////////////////////////////
150 
151 // Copy reads for the whole contig out of fragStore and into strSet.  The start and end
152 // positions of the alignments go into startEndPos.
153 
154 template<typename TValue, typename TStrSpec, typename TPosPair, typename TStringSpec, typename TSpec, typename TConfig, typename TId>
155 inline void
_loadContigReads(StringSet<TValue,Owner<TStrSpec>> & strSet,String<TPosPair,TStringSpec> & startEndPos,FragmentStore<TSpec,TConfig> const & fragStore,TId const contigId)156 _loadContigReads(StringSet<TValue, Owner<TStrSpec> > & strSet,
157                  String<TPosPair, TStringSpec> & startEndPos,
158                  FragmentStore<TSpec, TConfig> const & fragStore,
159                  TId const contigId)
160 {
161     typedef FragmentStore<TSpec, TConfig> const TFragmentStore;
162     typedef typename Size<TFragmentStore>::Type TSize;
163     typedef typename TFragmentStore::TReadPos TReadPos;
164 
165 
166     // Sort aligned reads according to contig id
167     sortAlignedReads(fragStore.alignedReadStore, SortContigId());
168     resize(strSet, length(fragStore.alignedReadStore));
169 
170     // Retrieve all reads, limit them to the clear range and if required reverse complement them
171     typedef typename Iterator<typename TFragmentStore::TAlignedReadStore const>::Type TAlignIter;
172     TAlignIter alignIt = lowerBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId());
173     TAlignIter alignItEnd = upperBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId());
174     TSize numRead = 0;
175     TReadPos begClr = 0;
176     TReadPos endClr = 0;
177     TSize lenRead = 0;
178     TSize offset = 0;
179     for(;alignIt != alignItEnd; ++alignIt) {
180         offset = _min(alignIt->beginPos, alignIt->endPos);
181         getClrRange(fragStore, *alignIt, begClr, endClr);
182         strSet[numRead] = infix(fragStore.readSeqStore[alignIt->readId], begClr, endClr);
183         lenRead = endClr - begClr;
184         if (alignIt->beginPos < alignIt->endPos) appendValue(startEndPos, TPosPair(offset, offset + lenRead), Generous());
185         else {
186             reverseComplement(strSet[numRead]);
187             appendValue(startEndPos, TPosPair(offset + lenRead, offset), Generous());
188         }
189         ++numRead;
190     }
191     resize(strSet, numRead, Exact());
192 }
193 
194 
195 
196 
197 //////////////////////////////////////////////////////////////////////////////
198 
199 template<typename TSpec, typename TConfig, typename TMatrix, typename TSize2, typename TSize, typename TReadSlot>
200 inline bool
convertAlignment(FragmentStore<TSpec,TConfig> & fragStore,TMatrix & mat,TSize2 contigId,TSize & coverage,TReadSlot & slot)201 convertAlignment(FragmentStore<TSpec, TConfig>& fragStore,
202                  TMatrix& mat,
203                  TSize2 contigId,
204                  TSize& coverage,
205                  TReadSlot& slot)
206 {
207     typedef FragmentStore<TSpec, TConfig> TFragmentStore;
208     typedef typename Value<TMatrix>::Type TValue;
209     typedef typename TFragmentStore::TContigPos TContigPos;
210 
211     // Gap char
212     TValue gapChar = gapValue<TValue>();
213 
214     // Sort according to contigId
215     sortAlignedReads(fragStore.alignedReadStore, SortContigId());
216 
217     // Find range of the given contig
218     typedef typename Iterator<typename TFragmentStore::TAlignedReadStore, Standard>::Type TAlignIter;
219     TAlignIter alignIt = lowerBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId());
220     TAlignIter alignItEnd = upperBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId());
221 
222     // Sort the reads according to the begin position
223     sortAlignedReads(infix(fragStore.alignedReadStore, alignIt - begin(fragStore.alignedReadStore, Standard()), alignItEnd - begin(fragStore.alignedReadStore, Standard())), SortBeginPos());
224     TAlignIter alignItBegin = alignIt = lowerBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId());
225     alignItEnd = upperBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId());
226 
227     // Get the maximum coverage and the slot for each read
228     typedef String<TSize> TFirstFreePos;
229     typedef typename Iterator<TFirstFreePos, Standard>::Type TPosIter;
230     TFirstFreePos freePos;
231     TSize pos = 0;
232     TSize maxTmp = 0;
233     TSize numCol = 0;
234     reserve(slot, alignItEnd - alignIt);
235     for(;alignIt != alignItEnd; ++alignIt) {
236         TPosIter itPos = begin(freePos, Standard());
237         TPosIter itPosEnd = end(freePos, Standard());
238         pos = 0;
239         for(;itPos != itPosEnd; ++itPos, ++pos)
240             if ((TContigPos)*itPos < _min(alignIt->beginPos, alignIt->endPos)) break;
241         if (pos + 1 > length(freePos)) resize(freePos, pos+1, Generous());
242         maxTmp = _max(alignIt->beginPos, alignIt->endPos);
243         freePos[pos] = maxTmp;
244         if (maxTmp > numCol) numCol = maxTmp;
245         appendValue(slot, pos);
246     }
247     coverage = length(freePos);
248     clear(freePos);
249 
250     // Fill the matrix
251     typedef typename Iterator<TMatrix, Standard>::Type TMatIter;
252     resize(mat, coverage * numCol, '.');
253     alignIt = alignItBegin;
254     TSize readPos = 0;
255     TMatIter matIt = begin(mat, Standard());
256     typename TFragmentStore::TReadSeq myRead;
257     for(;alignIt != alignItEnd; ++alignIt, ++readPos) {
258         typedef typename Iterator<String<typename TFragmentStore::TReadGapAnchor>, Standard>::Type TReadGapsIter;
259         TReadGapsIter itGaps = begin(alignIt->gaps, Standard());
260         TReadGapsIter itGapsEnd = end(alignIt->gaps, Standard());
261 
262         // Place each read inside the matrix
263         myRead = fragStore.readSeqStore[alignIt->readId];
264         TSize lenRead = length(myRead);
265         TSize offset = alignIt->beginPos;
266         if (alignIt->beginPos > alignIt->endPos) {
267             reverseComplement(myRead);
268             offset = alignIt->endPos;
269         }
270         matIt = begin(mat, Standard());
271         matIt += (slot[readPos] * numCol + offset);
272 
273         typedef typename Iterator<typename TFragmentStore::TReadSeq, Standard>::Type TReadIter;
274         TReadIter seqReadIt = begin(myRead, Standard());
275 
276         // First clear range
277         TSize mySeqPos = 0;
278         int diff = 0;
279         if ((itGaps != itGapsEnd) && (itGaps->gapPos == 0)) {
280             mySeqPos = itGaps->seqPos;
281             diff = -1 * mySeqPos;
282             seqReadIt += mySeqPos;
283         }
284         TSize clr2 = lenRead;
285         TSize stop = 0;
286         for(;itGaps != itGapsEnd; ++itGaps) {
287             // Any clipped sequence at the end
288             stop =  itGaps->seqPos;
289             if (diff - ((int) itGaps->gapPos - (int) itGaps->seqPos) > 0)
290                 clr2 = stop = lenRead - (diff - ((int) itGaps->gapPos - (int) itGaps->seqPos));
291 
292             for(;mySeqPos < stop; ++matIt, ++seqReadIt, ++mySeqPos)
293                 *matIt = *seqReadIt;
294 
295             for(int i = 0; i < ((int) itGaps->gapPos - (int) itGaps->seqPos) - diff; ++i, ++matIt)
296                 *matIt = gapChar;
297 
298             diff = (itGaps->gapPos - itGaps->seqPos);
299         }
300         for(;mySeqPos < clr2; ++mySeqPos, ++seqReadIt, ++matIt)
301             *matIt = *seqReadIt;
302     }
303     //for(TSize row = 0; row < coverage; ++row) {
304     //    for(TSize col = 0; col<numCol; ++col) {
305     //        std::cout << mat[row * numCol + col];
306     //    }
307     //    std::cout << std::endl;
308     //}
309     return true;
310 }
311 
312 //////////////////////////////////////////////////////////////////////////////
313 
314 template<typename TSpec, typename TConfig, typename TMatrix, typename TSize2, typename TSize>
315 inline bool
convertAlignment(FragmentStore<TSpec,TConfig> & fragStore,TMatrix & mat,TSize2 contigId,TSize & coverage)316 convertAlignment(FragmentStore<TSpec, TConfig>& fragStore,
317                  TMatrix& mat,
318                  TSize2 contigId,
319                  TSize& coverage)
320 {
321     String<TSize> slot;
322     return convertAlignment(fragStore, mat, contigId, coverage, slot);
323 }
324 
325 //////////////////////////////////////////////////////////////////////////////
326 
327 template<typename TSpec, typename TConfig, typename TMatrix>
328 inline bool
convertAlignment(FragmentStore<TSpec,TConfig> & fragStore,TMatrix & mat)329 convertAlignment(FragmentStore<TSpec, TConfig>& fragStore,
330                  TMatrix& mat)
331 {
332     typedef FragmentStore<TSpec, TConfig> TFragmentStore;
333     typedef typename Size<TFragmentStore>::Type TSize;
334     TSize coverage;
335     return convertAlignment(fragStore, mat, 0, coverage);
336 }
337 
338 //////////////////////////////////////////////////////////////////////////////
339 
340 template<typename TSpec, typename TConfig, typename TGappedConsensus, typename TSize>
341 inline void
getGappedConsensus(FragmentStore<TSpec,TConfig> & fragStore,TGappedConsensus & gappedConsensus,TSize contigId)342 getGappedConsensus(FragmentStore<TSpec, TConfig>& fragStore,
343                    TGappedConsensus& gappedConsensus,
344                    TSize contigId)
345 {
346     typedef FragmentStore<TSpec, TConfig> TFragmentStore;
347     typedef typename Value<TGappedConsensus>::Type TValue;
348     typedef typename TFragmentStore::TContigPos TContigPos;
349 
350     TValue gapChar = gapValue<TValue>();
351     typedef typename Iterator<typename TFragmentStore::TContigSeq, Standard>::Type TContigIter;
352     TContigIter seqContigIt = begin(fragStore.contigStore[contigId].seq, Standard());
353     TContigIter seqContigItEnd = end(fragStore.contigStore[contigId].seq, Standard());
354     typedef typename Iterator<String<typename TFragmentStore::TContigGapAnchor>, Standard>::Type TGapsIter;
355     TGapsIter itGaps = begin(fragStore.contigStore[contigId].gaps, Standard());
356     TGapsIter itGapsEnd = end(fragStore.contigStore[contigId].gaps, Standard());
357     int diff = 0;
358     TContigPos mySeqPos = 0;
359     for(;itGaps != itGapsEnd; goNext(itGaps)) {
360         for(;mySeqPos < itGaps->seqPos; ++seqContigIt, ++mySeqPos)
361             appendValue(gappedConsensus, *seqContigIt, Generous());
362 
363         for(int i = 0; i < ((int) itGaps->gapPos - (int) itGaps->seqPos) - diff; ++i)
364             appendValue(gappedConsensus, gapChar, Generous());
365             diff = (itGaps->gapPos - itGaps->seqPos);
366     }
367     for(;seqContigIt != seqContigItEnd; ++seqContigIt)
368         appendValue(gappedConsensus, *seqContigIt, Generous());
369 }
370 
371 
372 //////////////////////////////////////////////////////////////////////////////
373 
374 template<typename TSpec, typename TConfig, typename TGappedConsensus, typename TSize>
375 inline void
assignGappedConsensus(FragmentStore<TSpec,TConfig> & fragStore,TGappedConsensus & gappedCons,TSize contigId)376 assignGappedConsensus(FragmentStore<TSpec, TConfig>& fragStore,
377                       TGappedConsensus& gappedCons,
378                       TSize contigId)
379 {
380     typedef FragmentStore<TSpec, TConfig> TFragmentStore;
381     typedef typename Value<TGappedConsensus>::Type TValue;
382     TValue gapChar = gapValue<TValue>();
383 
384     // Update the contig
385     typedef typename Value<typename TFragmentStore::TContigStore>::Type TContigStoreElement;
386     TContigStoreElement& contigEl = fragStore.contigStore[contigId];
387     clear(contigEl.gaps);
388     clear(contigEl.seq);
389 
390     // Create the sequence and the gap anchors
391     typedef typename Iterator<TGappedConsensus, Standard>::Type TStringIter;
392     TStringIter seqIt = begin(gappedCons, Standard());
393     TStringIter seqItEnd = end(gappedCons, Standard());
394     typedef typename TFragmentStore::TReadPos TReadPos;
395     typedef typename TFragmentStore::TContigGapAnchor TContigGapAnchor;
396     TReadPos ungappedPos = 0;
397     TReadPos gappedPos = 0;
398     bool gapOpen = false;
399     for(;seqIt != seqItEnd; ++seqIt, ++gappedPos) {
400         if (*seqIt == gapChar) gapOpen = true;
401         else {
402             if (gapOpen) {
403                 appendValue(contigEl.gaps, TContigGapAnchor(ungappedPos, gappedPos), Generous());
404                 gapOpen = false;
405             }
406             Dna5Q letter = *seqIt;
407             assignQualityValue(letter, 'D');
408             appendValue(contigEl.seq, letter);
409             ++ungappedPos;
410         }
411     }
412     if (gapOpen)
413         appendValue(contigEl.gaps, TContigGapAnchor(ungappedPos, gappedPos), Generous());
414 }
415 
416 
417 
418 //////////////////////////////////////////////////////////////////////////////////
419 
420 /*
421  * @fn consensusAlignment
422  * @headerfile <seqan/consensus.h>
423  * @brief Compute consensus alignment.
424  *
425  * @signature void consensusAlignment(alignmentGraph, beginEndPos[, options])
426  *
427  * @param[out] alignmentGraph  Alignment graph to build.
428  * @param[in]  options         Optional settings for the consenus alignment, type: <tt>ConsensusOptions</tt>.
429  * @param[in]  beginEndPos     Interval start and end position for the read's alignment; <tt>String&lt;Pair&lt;TPos, TPos&gt; &gt;</tt>.
430  *
431  * @section Example
432  *
433  * @code{.cpp}
434  * #include <seqan/sequence.h>
435  * #include <seqan/graph_align.h>
436  * #include <seqan/consensus.h>
437  *
438  * int main()
439  * {
440  *     using namespace seqan;
441  *
442  *     typedef StringSet<Dna5String> TStringSet;
443  *     typedef Graph<Alignment<TStringSet, void, WithoutEdgeId> > TAlignGraph;
444  *
445  *     TStringSet readSet;
446  *     String<Pair<TSize> > begEndPos;
447  *
448  *     appendValue(readSet, "CCCAGTGA");
449  *     appendValue(begEndPos, Pair<TSize>(0, 5));
450  *     appendValue(readSet, "AGGGACTGT");
451  *     appendValue(begEndPos, Pair<TSize>(3, 9));
452  *
453  *     TAlignGraph alignmentGraph(readSet);
454  *     consensusAlignment(alignmentGraph, begEndPos);
455  *
456  *     return 0;
457  * }
458  * @endcode
459  */
460 
461 template<typename TStringSet, typename TCargo, typename TSpec, typename TSize, typename TConfigOptions>
462 inline void
consensusAlignment(Graph<Alignment<TStringSet,TCargo,TSpec>> & gOut,String<Pair<TSize,TSize>> & begEndPos,TConfigOptions const & consOpt)463 consensusAlignment(Graph<Alignment<TStringSet, TCargo, TSpec> >& gOut,
464                    String<Pair<TSize, TSize> >& begEndPos,
465                    TConfigOptions const& consOpt)
466 {
467     typedef Graph<Alignment<TStringSet, TCargo, TSpec> > TOutGraph;
468     typedef typename Id<TOutGraph>::Type TId;
469 
470     // Initialization
471     TStringSet& seqSet = stringSet(gOut);
472 
473     // Select all overlapping reads and record the diagonals of the band
474     String<Pair<TId, TId> > pList;
475     String<Pair<int, int> > diagList;
476     if (consOpt.window == 0)
477         selectPairsAssembly(seqSet, begEndPos, consOpt.bandwidth, pList, diagList);
478     else
479         selectPairsAllAgainstAll(seqSet, begEndPos, consOpt.window, pList, diagList);
480 
481     // Set-up a sparse distance matrix
482     Graph<Undirected<double> > pairGraph;
483 
484     // Containers for segment matches and corresponding scores
485     typedef String<Fragment<> > TFragmentString;
486     TFragmentString matches;
487     typedef String<int> TScoreValues;
488     TScoreValues scores;
489 
490     // Compute segment matches from global pairwise alignments
491     appendSegmentMatches(seqSet, pList, diagList, begEndPos, consOpt.sc, consOpt.matchlength, consOpt.quality, consOpt.overlaps, matches, scores, pairGraph, OverlapLibrary() );
492     clear(pList);
493     clear(diagList);
494 
495     // If there are no alignment matches, return
496     if (!length(matches)) return;
497 
498     // Use these segment matches for the initial alignment graph
499     typedef Graph<Alignment<TStringSet, TSize> > TGraph;
500     TGraph g(seqSet);
501     buildAlignmentGraph(matches, scores, g, consOpt.sc, ReScore() );
502     clear(matches);
503     clear(scores);
504 
505     // Guide Tree
506     Graph<Tree<double> > guideTree;
507     upgmaTree(pairGraph, guideTree);
508     clear(pairGraph);
509 
510     // Triplet library extension
511     graphBasedTripletLibraryExtension(g);
512 
513     // Perform a progressive alignment
514     progressiveAlignment(g, guideTree, gOut);
515     clear(g);
516     clear(guideTree);
517 }
518 
519 //////////////////////////////////////////////////////////////////////////////////
520 
521 
522 template<typename TStringSet, typename TCargo, typename TSpec, typename TSize>
523 inline void
consensusAlignment(Graph<Alignment<TStringSet,TCargo,TSpec>> & gOut,String<Pair<TSize,TSize>> & begEndPos)524 consensusAlignment(Graph<Alignment<TStringSet, TCargo, TSpec> >& gOut,
525                    String<Pair<TSize, TSize> >& begEndPos)
526 {
527     ConsensusOptions consOpt;
528     consensusAlignment(gOut, begEndPos, consOpt);
529 }
530 
531 
532 //////////////////////////////////////////////////////////////////////////////
533 
534 template <typename TFragSpec, typename TConfig, typename TStringSet, typename TCargo, typename TSpec, typename TContigId>
535 inline void
updateContig(FragmentStore<TFragSpec,TConfig> & fragStore,Graph<Alignment<TStringSet,TCargo,TSpec>> const & g,TContigId contigId)536 updateContig(FragmentStore<TFragSpec, TConfig>& fragStore,
537              Graph<Alignment<TStringSet, TCargo, TSpec> > const& g,
538              TContigId contigId)
539 {
540     SEQAN_CHECKPOINT
541     typedef Graph<Alignment<TStringSet, TCargo, TSpec> > TGraph;
542     typedef typename Size<TGraph>::Type TSize;
543     typedef typename Value<TStringSet>::Type TString;
544     typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
545     typedef std::map<TSize, TSize> TComponentLength;
546     typedef char TValue;
547 
548     // Initialization
549     TStringSet& strSet = stringSet(g);
550     TSize nseq = length(strSet);
551     TValue gapChar = gapValue<TValue>();
552     TValue specialGap = '.';
553     TSize maxCoverage = 0;
554     TSize len = 0;
555     String<TValue> mat;
556 
557     // Store for each read the begin position, the end position and the row in the alignment matrix
558     String<TSize> readBegEndRowPos;
559     resize(readBegEndRowPos, 3*nseq);
560 
561     // Strongly Connected Components, topological sort, and length of each component
562     String<TSize> component;
563     String<TSize> order;
564     TComponentLength compLength;
565     if (convertAlignment(g, component, order, compLength)) {
566         TSize numOfComponents = length(order);
567 
568         // Assign to each sequence the start and end (in terms of component ranks)
569         typedef String<std::pair<TSize, TSize> > TComponentToRank;
570         TComponentToRank compToRank;
571         for(TSize compIndex = 0; compIndex < numOfComponents; ++compIndex)
572             appendValue(compToRank, std::make_pair(order[compIndex], compIndex), Generous());
573         std::sort(begin(compToRank, Standard()), end(compToRank, Standard()));
574 
575         typedef Pair<TSize, TSize> TRankPair;
576         typedef String<TRankPair> TSequenceToRanks;
577         TSequenceToRanks seqToRank;
578         resize(seqToRank, nseq);
579         typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
580         TVertexIterator itVertex(g);
581         for(;!atEnd(itVertex);++itVertex) {
582             TVertexDescriptor vert = value(itVertex);
583             TSize seq = idToPosition(strSet, sequenceId(g, vert));
584             if (fragmentBegin(g, vert) == 0)
585                 seqToRank[seq].i1 = std::lower_bound(begin(compToRank, Standard()), end(compToRank, Standard()), std::make_pair((TSize) component[vert], (TSize) 0))->second;
586             if (fragmentBegin(g, vert) + fragmentLength(g, vert) == length(strSet[seq]))
587                 seqToRank[seq].i2 = std::lower_bound(begin(compToRank, Standard()), end(compToRank, Standard()), std::make_pair((TSize) component[vert], (TSize) 0))->second;
588         }
589         clear(compToRank);
590 
591         // Assign the sequences to rows
592         String<TSize> seqToRow;
593         resize(seqToRow, nseq);
594         maxCoverage = 0;
595         typedef String<bool> TLeftOver;
596         typedef typename Iterator<TLeftOver, Standard>::Type TLeftOverIter;
597         TLeftOver leftOver;
598         resize(leftOver, nseq, true);
599         typedef String<std::pair<TSize, TSize> > TSeqToBegin;
600         typedef typename Iterator<TSeqToBegin, Standard>::Type TSeqToBeginIter;
601         TSeqToBegin seqToBegin;
602         TSize finishedSeq = 0;
603         while(finishedSeq < nseq) {
604             TLeftOverIter itL = begin(leftOver, Standard());
605             TLeftOverIter itLEnd = end(leftOver, Standard());
606             for(TSize pos = 0; itL != itLEnd; ++itL, ++pos)
607                 if (*itL) appendValue(seqToBegin, std::make_pair((seqToRank[pos]).i1, pos), Generous());
608             std::sort(begin(seqToBegin, Standard()), end(seqToBegin, Standard()));
609 
610             TSize endPos = 0;
611             TSeqToBeginIter itSB = begin(seqToBegin, Standard());
612             TSeqToBeginIter itSBEnd = end(seqToBegin, Standard());
613             for(;itSB != itSBEnd;++itSB) {
614                 if (endPos <= (*itSB).first) {
615                     TSize currentSeq = (*itSB).second;
616                     seqToRow[currentSeq] = maxCoverage;
617                     endPos = (seqToRank[currentSeq]).i2 + 2;
618                     leftOver[currentSeq] = false;
619                     ++finishedSeq;
620                 }
621             }
622             clear(seqToBegin);
623             ++maxCoverage;
624         }
625         clear(leftOver);
626 
627         // Create the matrix
628         len = 0;
629         String<TSize> compOffset;
630         resize(compOffset, numOfComponents);
631         for(TSize compIndex = 0; compIndex < numOfComponents; ++compIndex) {
632             compOffset[order[compIndex]] = len;
633             len+=compLength[order[compIndex]];
634         }
635         resize(mat, len * maxCoverage, gapChar);
636 
637         // Fill in the segments
638         typedef typename Infix<TString>::Type TInfix;
639         typedef typename Iterator<TInfix, Standard>::Type TInfixIter;
640         typedef typename TGraph::TPosToVertexMap_ TPosToVertexMap;
641         for(typename TPosToVertexMap::const_iterator it = g.data_pvMap.begin();it != g.data_pvMap.end(); ++it) {
642             TInfix str = label(g,it->second);
643             TSize c = property(component, it->second);
644             TSize row = seqToRow[idToPosition(strSet, it->first.first)];
645             //if (row == 0) {
646             //    std::cout << sequenceId(g, it->second) << ':' << str << ',' << strSet[sequenceId(g, it->second)] << std::endl;
647             //    std::cout << getProperty(component, it->second) << ',' << order[compIndex] << std::endl;
648             //    std::cout << (seqToRank[sequenceId(g, it->second)]).i1 << ',' << (seqToRank[sequenceId(g, it->second)]).i2 << std::endl;
649             //}
650             TInfixIter sIt = begin(str, Standard());
651             TInfixIter sItEnd = end(str, Standard());
652             TSize i = compOffset[c];
653             for(TSize pCol = i;sIt!=sItEnd;++sIt, ++pCol, ++i)
654                 mat[row * len + pCol] = *sIt;
655         }
656         String<bool> active;
657         for(TSize compIndex = 0; compIndex < numOfComponents; ++compIndex) {
658             TSize offset = compOffset[order[compIndex]];
659             TSize currentCompLength = compLength[order[compIndex]];
660 
661             clear(active);
662             resize(active, maxCoverage, false);
663 
664             // Find the empty rows
665             for(TSize i=0;i<nseq; ++i) {
666                 if (((seqToRank[i]).i1 <= compIndex) && ((seqToRank[i]).i2 >= compIndex))
667                     active[(seqToRow[i])] = true;
668             }
669 
670             // Substitute false gaps with special gap character
671             for(TSize i = 0; i < maxCoverage; ++i) {
672                 if (!(active[i])) {
673                     for(TSize pCol = offset;pCol < offset + currentCompLength;++pCol)
674                         mat[i * len + pCol] = specialGap;
675                 }
676             }
677         }
678 
679         // Get the new begin and end positions
680         for(TSize i=0;i<nseq; ++i) {
681             TVertexDescriptor lastVertex = findVertex(const_cast<TGraph&>(g), positionToId(strSet, i), length(strSet[i]) - 1);
682             TSize readBegin = compOffset[getProperty(component, findVertex(const_cast<TGraph&>(g), positionToId(strSet, i), 0))];
683             TSize readEnd = compOffset[getProperty(component, lastVertex)] + fragmentLength(const_cast<TGraph&>(g), lastVertex);
684             readBegEndRowPos[3*i] = readBegin;
685             readBegEndRowPos[3*i+1] = readEnd;
686             readBegEndRowPos[3*i+2] = seqToRow[i];
687         }
688 
689     }
690     clear(component);
691     clear(order);
692     compLength.clear();
693 
694 
695     //// Debug code
696     //for(TSize row = 0; row<maxCoverage; ++row) {
697     //    for(TSize col = 0; col<len; ++col) {
698     //        std::cout << mat[row * len + col];
699     //    }
700     //    std::cout << std::endl;
701     //}
702 
703     // Create the new consensus
704     typedef typename Value<TString>::Type TAlphabet;
705     String<TValue> gappedCons;
706     consensusCalling(mat, gappedCons, maxCoverage, TAlphabet(), MajorityVote());
707 
708     // Assign new consensus
709     assignGappedConsensus(fragStore, gappedCons, contigId);
710 
711     // Update all aligned reads
712     typedef FragmentStore<TSpec, TConfig> TFragmentStore;
713     typedef typename TFragmentStore::TReadPos TReadPos;
714     typedef typename TFragmentStore::TContigPos TContigPos;
715     typedef typename TFragmentStore::TContigGapAnchor TContigGapAnchor;
716     //typedef typename Value<typename TFragmentStore::TAlignedReadStore>::Type TAlignedElement;
717     typedef typename Iterator<typename TFragmentStore::TAlignedReadStore>::Type TAlignIter;
718     sortAlignedReads(fragStore.alignedReadStore, SortContigId());
719     TAlignIter alignIt = lowerBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId());
720     TAlignIter alignItEnd = upperBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId());
721     TReadPos ungappedPos = 0;
722     TReadPos gappedPos = 0;
723     bool gapOpen;
724     for(TSize i = 0;alignIt != alignItEnd; ++alignIt, ++i) {
725         TSize lenRead = length(fragStore.readSeqStore[alignIt->readId]);
726         TReadPos begClr = 0;
727         TReadPos endClr = 0;
728         getClrRange(fragStore, *alignIt, begClr, endClr);
729         clear(alignIt->gaps);
730         ungappedPos = begClr;
731         if (alignIt->beginPos > alignIt->endPos) ungappedPos = lenRead - endClr;
732         if (ungappedPos != 0) appendValue(alignIt->gaps, TContigGapAnchor(ungappedPos, 0));
733         gappedPos = 0;
734         gapOpen = false;
735         for(TSize column = readBegEndRowPos[3*i]; column<readBegEndRowPos[3*i + 1]; ++column, ++gappedPos) {
736             if (mat[readBegEndRowPos[3*i + 2] * len + column] == gapChar) gapOpen = true;
737             else {
738                 if (gapOpen) {
739                     appendValue(alignIt->gaps, TContigGapAnchor(ungappedPos, gappedPos), Generous());
740                     gapOpen = false;
741                 }
742                 ++ungappedPos;
743             }
744         }
745         if (gapOpen) appendValue(alignIt->gaps, TContigGapAnchor(ungappedPos, gappedPos), Generous());
746         if (alignIt->beginPos < alignIt->endPos) {
747             if (endClr != (TContigPos)lenRead)
748                 appendValue(alignIt->gaps, TContigGapAnchor(lenRead, lenRead + (gappedPos - ungappedPos) - (lenRead - endClr)), Generous());
749         } else {
750             if (begClr != 0)
751                 appendValue(alignIt->gaps, TContigGapAnchor(lenRead, lenRead + (gappedPos - ungappedPos) - begClr), Generous());
752         }
753 
754         // Set new begin and end position
755         if (alignIt->beginPos < alignIt->endPos) {
756             alignIt->beginPos = readBegEndRowPos[3*i];
757             alignIt->endPos = readBegEndRowPos[3*i+1];
758         } else {
759             alignIt->beginPos = readBegEndRowPos[3*i+1];
760             alignIt->endPos = readBegEndRowPos[3*i];
761         }
762     }
763 }
764 
765 
766 //////////////////////////////////////////////////////////////////////////////
767 
768 template <typename TValue, typename TSpec, typename TCounters, typename TSize, typename TAlphabet>
769 inline void
_countLetters(String<TValue,TSpec> const & mat,TCounters & counterValues,TSize alignDepth,TAlphabet)770 _countLetters(String<TValue, TSpec> const & mat,
771               TCounters & counterValues,
772               TSize alignDepth,
773               TAlphabet)
774 {
775     typedef String<TValue, TSpec> const TMatrix;
776     typedef typename Iterator<TMatrix, Standard>::Type TMatIter;
777 
778     // Initialization
779     TSize len = length(mat) / alignDepth;
780     TValue gapChar = gapValue<TValue>();
781     TValue specialGap = '.';
782     TSize alphabetSize = ValueSize<TAlphabet>::VALUE;
783 
784     // Set-up counter values
785     typedef typename Value<TCounters>::Type TCounter;
786     typedef typename Iterator<TCounters, Standard>::Type TCounterIt;
787     resize(counterValues, len);
788     for(TSize i=0;i<len; ++i) {
789         TCounter counter;
790         resize(counter, alphabetSize + 1, 0);
791         counterValues[i] = counter;
792     }
793 
794     // Count all
795     TMatIter matIt = begin(mat, Standard());
796     TMatIter matItEnd = end(mat, Standard());
797     TCounterIt countIt = begin(counterValues, Standard());
798     TSize pos = 0;
799     for(; matIt != matItEnd; ++matIt, ++countIt, ++pos) {
800         if (pos % len == 0) countIt = begin(counterValues, Standard());
801         if (*matIt != specialGap) {
802             if (*matIt == gapChar) ++value(*countIt, alphabetSize);
803             else ++value(*countIt, ordValue((TAlphabet) *matIt));
804         }
805     }
806 }
807 
808 
809 //////////////////////////////////////////////////////////////////////////////
810 
811 template <typename TValue, typename TSpec, typename TGappedCons, typename TAlignDepth, typename TAlphabet>
812 inline void
consensusCalling(String<TValue,TSpec> const & mat,TGappedCons & gappedConsensus,TAlignDepth maxCoverage,TAlphabet,Bayesian)813 consensusCalling(String<TValue, TSpec> const& mat,
814                  TGappedCons& gappedConsensus,
815                  TAlignDepth maxCoverage,
816                  TAlphabet,
817                  Bayesian)
818 {
819     typedef double TProbability;
820     typedef String<TProbability> TProbabilityDistribution;
821     typedef String<TProbabilityDistribution> TPositionalPrDist;
822     typedef typename Iterator<TPositionalPrDist, Standard>::Type TPosPrDistIter;
823 
824     typedef typename Size<String<TValue, TSpec> >::Type TSize;
825     TSize alphabetSize = ValueSize<TAlphabet>::VALUE;
826     TValue gapChar = gapValue<TValue>();
827     TValue specialGap = '.';
828 
829     // Set-up the counters
830     typedef String<TSize> TCounter;
831     typedef String<TCounter> TCounters;
832     TCounters counterValues;
833     _countLetters(mat, counterValues, maxCoverage, TAlphabet() );
834 
835 
836     // Initialization
837     TSize len = length(mat) / maxCoverage;
838     TProbabilityDistribution backroundDist;
839     resize(backroundDist, alphabetSize + 1, ((TProbability) 1 / (TProbability) (alphabetSize + 1)));
840 
841     // Get an initial consensus
842     typedef typename Iterator<TCounters, Standard>::Type TCounterIt;
843     TCounterIt countIt = begin(counterValues, Standard());
844     TCounterIt countItEnd = end(counterValues, Standard());
845     TPositionalPrDist posPrDist;
846     TValue c = TAlphabet();
847     for(;countIt != countItEnd; ++countIt) {
848         TSize max = 0;
849         typedef typename Iterator<TCounter, Standard>::Type TCIt;
850         TCIt cIt = begin(*countIt, Standard());
851         TCIt cItEnd = end(*countIt, Standard());
852         TSize pos = 0;
853         for(;cIt != cItEnd; ++cIt, ++pos) {
854             if (*cIt > max) {
855                 max = *cIt;
856                 c = (pos == alphabetSize) ? gapChar : (TValue) TAlphabet(pos);
857             }
858         }
859         TProbabilityDistribution prDist;
860         resize(prDist, alphabetSize + 1, 0);
861         if (c == gapChar) prDist[alphabetSize] = 1;
862         else prDist[ordValue((TAlphabet) c)] = 1;
863         appendValue(posPrDist, prDist, Generous());
864     }
865 
866     TSize run = 1;
867     TProbabilityDistribution pI;
868     TProbabilityDistribution pIJ;
869     TProbabilityDistribution pIOld;
870     TProbabilityDistribution pIJOld;
871     while (run) {
872         // Store the values from the last iteration
873         pIOld = pI;
874         pIJOld = pIJ;
875 
876         // Count all letters in the consensus
877         TProbabilityDistribution nI;
878         resize(nI, alphabetSize + 1, 0);
879         TPosPrDistIter itPosPrDist = begin(posPrDist, Standard());
880         TPosPrDistIter itPosPrDistEnd = end(posPrDist, Standard());
881         for(;itPosPrDist!=itPosPrDistEnd; ++itPosPrDist)
882             for(TSize i = 0; i<(alphabetSize + 1); ++i)
883                 nI[i] += (*itPosPrDist)[i];
884 
885         // Composition probabilities
886         clear(pI);
887         resize(pI, alphabetSize + 1);
888         TProbability lenPosPrDist = (TProbability) length(posPrDist);
889         for(TSize i = 0; i<length(pI); ++i)
890             pI[i] = nI[i] / lenPosPrDist;
891 
892 
893         // Count all letters that agree / disagree with the consensus
894         TProbabilityDistribution nIJ;
895         resize(nIJ, (alphabetSize + 1) * (alphabetSize + 1), 0);
896         typedef String<TValue, TSpec> TMatrix;
897         typedef typename Iterator<TMatrix, Standard>::Type TMatIter;
898         TMatIter matIt = begin(mat, Standard());
899         TMatIter matItEnd = end(mat, Standard());
900         itPosPrDist = begin(posPrDist, Standard());
901         TSize pos = 0;
902         for(; matIt != matItEnd; ++matIt, ++itPosPrDist, ++pos) {
903             if (pos % len == 0) itPosPrDist = begin(posPrDist, Standard());
904             TValue c = *matIt;
905             if (c != specialGap) {
906                 TSize fragJ = (c != gapChar) ? ordValue(TAlphabet(c)) : alphabetSize;
907                 for(TSize consI = 0; consI<(alphabetSize + 1); ++consI)
908                     nIJ[consI * (alphabetSize + 1) + fragJ] += (*itPosPrDist)[consI];
909             }
910         }
911 
912         // Sequencing error probabilities
913         clear(pIJ);
914         resize(pIJ, (alphabetSize + 1) * (alphabetSize + 1));
915         TProbability sumIJ = 0;
916         for(TSize diag = 0; diag<(alphabetSize + 1); ++diag) sumIJ += nIJ[diag * (alphabetSize + 1) + diag];
917         for(TSize consI = 0; consI<(alphabetSize + 1); ++consI)
918             for(TSize fragJ = 0; fragJ<(alphabetSize + 1); ++fragJ)
919                 pIJ[consI * (alphabetSize + 1) + fragJ] = nIJ[consI * (alphabetSize + 1) + fragJ] / sumIJ;
920 
921         // Recompute positional probability distribution
922         itPosPrDist = begin(posPrDist, Standard());
923         TSize col = 0;
924         for(;itPosPrDist!=itPosPrDistEnd; ++itPosPrDist, ++col) {
925             TProbabilityDistribution prDist;
926             resize(prDist, alphabetSize + 1);
927             for(TSize consI = 0; consI<(alphabetSize + 1); ++consI) {
928                 TProbability numerator = pI[consI];
929                 TProbability denominator = 0;
930                 for(TSize allI = 0; allI<(alphabetSize + 1); ++allI) {
931                     TProbability denominatorSub = value(pI, allI);
932                     for(TSize row = 0; row < maxCoverage; ++row) {
933                         TValue c = mat[row * len + col];
934                         if (c != specialGap) {
935                             TSize fragJ = (c != gapChar) ? ordValue(TAlphabet(c)) : alphabetSize;
936                             if (allI == consI)
937                                 numerator *= pIJ[allI * (alphabetSize + 1) + fragJ];
938                             denominatorSub *= pIJ[allI * (alphabetSize + 1) + fragJ];
939                         }
940                     }
941                     denominator += denominatorSub;
942                 }
943                 prDist[consI] = numerator / denominator;
944             }
945             *itPosPrDist = prDist;
946         }
947 
948         // Check termination criterion
949         TProbability eps = 0.00001;
950         typedef typename Iterator<TProbabilityDistribution, Standard>::Type TProbIter;
951         TProbIter pIter = begin(pIOld, Standard());
952         TProbIter pIterCompare = begin(pI, Standard());
953         TProbIter pIterEnd = end(pIOld, Standard());
954         TSize runOld = run;
955         for(;pIter != pIterEnd; ++pIter, ++pIterCompare) {
956             if (*pIter > *pIterCompare) {
957                 if (*pIter - *pIterCompare > eps) {
958                     ++run;
959                     break;
960                 }
961             } else {
962                 if (*pIterCompare - *pIter > eps) {
963                     ++run;
964                     break;
965                 }
966             }
967         }
968         if (runOld == run) {
969             pIter = begin(pIJOld, Standard());
970             pIterCompare = begin(pIJ, Standard());
971             pIterEnd = end(pIJOld, Standard());
972             for(;pIter != pIterEnd; ++pIter, ++pIterCompare) {
973                 if (*pIter > *pIterCompare) {
974                     if (*pIter - *pIterCompare > eps) {
975                         ++run;
976                         break;
977                     }
978                 } else {
979                     if (*pIterCompare - *pIter > eps) {
980                         ++run;
981                         break;
982                     }
983                 }
984             }
985         }
986 
987         if (runOld == run) {
988             std::cout << "Iterations: " << run << std::endl;
989             run = 0;
990         }
991     }
992 
993     // Compute the most likely consensus
994     TPosPrDistIter itPosPrDist = begin(posPrDist, Standard());
995     TPosPrDistIter itPosPrDistEnd = end(posPrDist, Standard());
996     clear(gappedConsensus);
997     for(;itPosPrDist!=itPosPrDistEnd; ++itPosPrDist) {
998         TProbability max = 0;
999         TSize ind = 0;
1000         for(TSize consI = 0; consI<(alphabetSize + 1); ++consI) {
1001             if ((*itPosPrDist)[consI] > max) {
1002                 max = (*itPosPrDist)[consI];
1003                 ind = consI;
1004             }
1005         }
1006         if (ind == alphabetSize) appendValue(gappedConsensus, gapChar);
1007         else appendValue(gappedConsensus, TAlphabet(ind));
1008     }
1009 
1010 }
1011 
1012 //////////////////////////////////////////////////////////////////////////////
1013 
1014 
1015 template <typename TFragSpec, typename TConfig, typename TContigId>
1016 inline void
consensusCalling(FragmentStore<TFragSpec,TConfig> & fragStore,TContigId contigId,Bayesian)1017 consensusCalling(FragmentStore<TFragSpec, TConfig>& fragStore,
1018                  TContigId contigId,
1019                  Bayesian)
1020 {
1021     SEQAN_CHECKPOINT
1022 
1023     typedef FragmentStore<TFragSpec, TConfig> TFragmentStore;
1024     typedef typename Size<TFragmentStore>::Type TSize;
1025     typedef typename TFragmentStore::TReadSeq TReadSeq;
1026     typedef typename Value<TReadSeq>::Type TAlphabet;
1027     typedef char TValue;
1028 
1029     // Convert the contig to an alignment matrix
1030     typedef String<TValue> TAlignMat;
1031     TAlignMat mat;
1032     TSize maxCoverage;
1033     convertAlignment(fragStore, mat, contigId, maxCoverage);
1034 
1035     // Call the consensus
1036     String<TValue> gappedConsensus;
1037     consensusCalling(mat, gappedConsensus, maxCoverage, TAlphabet(), Bayesian());
1038 
1039     // Assign the new consensus
1040     assignGappedConsensus(fragStore, gappedConsensus, contigId);
1041 }
1042 
1043 //////////////////////////////////////////////////////////////////////////////
1044 
1045 template <typename TValue, typename TSpec, typename TGappedCons, typename TAlignDepth, typename TAlphabet>
1046 inline void
consensusCalling(String<TValue,TSpec> const & mat,TGappedCons & gappedConsensus,TAlignDepth maxCoverage,TAlphabet,MajorityVote)1047 consensusCalling(String<TValue, TSpec> const& mat,
1048                  TGappedCons& gappedConsensus,
1049                  TAlignDepth maxCoverage,
1050                  TAlphabet,
1051                  MajorityVote)
1052 {
1053     typedef typename Size<String<TValue, TSpec> >::Type TSize;
1054     TSize alphabetSize = ValueSize<TAlphabet>::VALUE;
1055     TValue gapChar = gapValue<TValue>();
1056 
1057     // Set-up the counters
1058     typedef String<TSize> TCounter;
1059     typedef String<TCounter> TCounters;
1060     TCounters counterValues;
1061     _countLetters(mat, counterValues, maxCoverage, TAlphabet() );
1062 
1063     // Get the consensus
1064     typedef typename Iterator<TCounters, Standard>::Type TCounterIt;
1065     TCounterIt countIt = begin(counterValues, Standard());
1066     TCounterIt countItEnd = end(counterValues, Standard());
1067     clear(gappedConsensus);
1068     TSize max = 0;
1069     TValue c = TValue();
1070     TSize pos = 0;
1071     for(;countIt != countItEnd; ++countIt) {
1072         max = 0;
1073         typedef typename Iterator<TCounter, Standard>::Type TCIt;
1074         TCIt cIt = begin(*countIt, Standard());
1075         TCIt cItEnd = end(*countIt, Standard());
1076         pos = 0;
1077         for(;cIt != cItEnd; ++cIt, ++pos) {
1078             if (*cIt > max) {
1079                 max = *cIt;
1080                 c = (pos == alphabetSize) ? gapChar : (TValue) TAlphabet(pos);
1081             }
1082         }
1083         appendValue(gappedConsensus, c);
1084     }
1085 }
1086 
1087 
1088 //////////////////////////////////////////////////////////////////////////////
1089 
1090 
1091 template <typename TFragSpec, typename TConfig, typename TContigId>
1092 inline void
consensusCalling(FragmentStore<TFragSpec,TConfig> & fragStore,TContigId contigId,MajorityVote)1093 consensusCalling(FragmentStore<TFragSpec, TConfig>& fragStore,
1094                  TContigId contigId,
1095                  MajorityVote)
1096 {
1097     typedef FragmentStore<TFragSpec, TConfig> TFragmentStore;
1098     typedef typename Size<TFragmentStore>::Type TSize;
1099     typedef typename TFragmentStore::TReadSeq TReadSeq;
1100     typedef typename Value<TReadSeq>::Type TAlphabet;
1101     typedef char TValue;
1102 
1103     // Convert the contig to an alignment matrix
1104     typedef String<TValue> TAlignMat;
1105     TAlignMat mat;
1106     TSize maxCoverage;
1107     convertAlignment(fragStore, mat, contigId, maxCoverage);
1108 
1109     // Call the consensus
1110     String<TValue> gappedConsensus;
1111     consensusCalling(mat, gappedConsensus, maxCoverage, TAlphabet(), MajorityVote());
1112 
1113     // Assign the new consensus
1114     assignGappedConsensus(fragStore, gappedConsensus, contigId);
1115 }
1116 
1117 
1118 //////////////////////////////////////////////////////////////////////////////
1119 
1120 
1121 //////////////////////////////////////////////////////////////////////////////
1122 // Old proprietary FastaReadFormat
1123 //////////////////////////////////////////////////////////////////////////////
1124 
1125 //////////////////////////////////////////////////////////////////////////////
1126 
1127 struct FastaReadFormat_;
1128 typedef Tag<FastaReadFormat_> const FastaReadFormat;
1129 
1130 //////////////////////////////////////////////////////////////////////////////
1131 
1132 template<typename TFile, typename TSpec, typename TConfig>
1133 inline void
write(TFile & file,FragmentStore<TSpec,TConfig> & fragStore,FastaReadFormat)1134 write(
1135     TFile & file,
1136     FragmentStore<TSpec, TConfig>& fragStore,
1137     FastaReadFormat)
1138 {
1139 //IOREV _nodoc_ what has this got to do with consensus? why is it here?
1140     // Basic types
1141     typedef FragmentStore<TSpec, TConfig> TFragmentStore;
1142     typedef typename Size<TFragmentStore>::Type TSize;
1143     typedef typename TFragmentStore::TContigPos TContigPos;
1144     //typedef typename Value<TFile>::Type TValue;
1145     typedef char TMultiReadChar;
1146     TMultiReadChar gapChar = gapValue<TMultiReadChar>();
1147 
1148     typename DirectionIterator<TFile, Output>::Type target = directionIterator(file, Output());
1149 
1150     typedef typename Iterator<typename TFragmentStore::TContigStore, Standard>::Type TContigIter;
1151     TContigIter contigIt = begin(fragStore.contigStore, Standard() );
1152     TContigIter contigItEnd = end(fragStore.contigStore, Standard() );
1153     for(TSize idCount = 0;contigIt != contigItEnd; ++contigIt, ++idCount) {
1154         // Alignment matrix
1155         typedef String<TMultiReadChar> TAlignMat;
1156         TAlignMat mat;
1157         TSize maxCoverage;
1158         String<TSize> readSlot;
1159         convertAlignment(fragStore, mat, idCount, maxCoverage, readSlot);
1160         TSize len = length(mat) / maxCoverage;
1161 
1162         // Gapped consensus sequence
1163         typedef String<TMultiReadChar> TGappedConsensus;
1164         TGappedConsensus gappedConsensus;
1165         getGappedConsensus(fragStore, gappedConsensus, idCount);
1166 
1167         // Print the alignment matrix
1168         String<TSize> coverage;
1169         resize(coverage, len, 0);
1170         typedef typename Iterator<TGappedConsensus, Standard>::Type TConsIter;
1171         TConsIter itCons = begin(gappedConsensus, Standard());
1172         TSize winSize = 60;
1173         int offset = 2;
1174         TSize column = 0;
1175         while (column<len) {
1176             TSize window_end = column + winSize;
1177             if (window_end >= len) window_end = len;
1178             // Position
1179             for(int i = 0; i<offset - 2; ++i) writeValue(target, ' ');
1180             write(target, "Pos: ");
1181             appendNumber(target, column);
1182             writeValue(target, '\n');
1183             // Ruler
1184             for(int i = 0; i<offset + 3; ++i) writeValue(target, ' ');
1185             for(TSize local_col = 1; local_col<window_end - column + 1; ++local_col) {
1186                 if ((local_col % 10)==0)
1187                     writeValue(target, ':');
1188                 else if ((local_col % 5)==0)
1189                     writeValue(target, '.');
1190                 else
1191                     writeValue(target, ' ');
1192             }
1193             writeValue(target, '\n');
1194             // Matrix
1195             for(TSize row = 0; row<maxCoverage; ++row) {
1196                 TSize tmp = row;
1197                 int off = 0;
1198                 while (tmp / 10 != 0) {
1199                     tmp /= 10;
1200                     ++off;
1201                 }
1202                 for(int i = 0; i<offset - off; ++i) writeValue(target, ' ');
1203                 appendNumber(target, row);
1204                 writeValue(target, ':');
1205                 writeValue(target, ' ');
1206                 for(TSize local_col = column; local_col<window_end; ++local_col) {
1207                     writeValue(target, mat[row * len + local_col]);
1208                     if (mat[row * len + local_col] != '.') ++coverage[local_col];
1209                 }
1210                 writeValue(target, '\n');
1211             }
1212             writeValue(target, '\n');
1213 
1214             // Consensus
1215             for(int i = 0; i<offset; ++i)
1216                 writeValue(target, ' ');
1217             write(target, "C: ");
1218             for(unsigned int local_col = column; local_col<window_end; ++local_col, ++itCons)
1219                 writeValue(target, *itCons);
1220             writeValue(target, '\n');
1221             for(int i = 0; i<offset-1; ++i)
1222                 writeValue(target, ' ');
1223             write(target, ">2: ");
1224             for(unsigned int local_col = column; local_col<window_end; ++local_col) {
1225                 if (coverage[local_col] > 2)
1226                     writeValue(target, gappedConsensus[local_col]);
1227                 else
1228                     writeValue(target, gapChar);
1229             }
1230             writeValue(target, '\n');
1231             writeValue(target, '\n');
1232             column+=winSize;
1233         }
1234         writeValue(target, '\n');
1235         writeValue(target, '\n');
1236 
1237         // Print all aligned reads belonging to this contig
1238 
1239         // Sort according to contigId
1240         sortAlignedReads(fragStore.alignedReadStore, SortContigId());
1241 
1242         // Find range of the given contig
1243         typedef typename Iterator<typename TFragmentStore::TAlignedReadStore, Standard>::Type TAlignIter;
1244         TAlignIter alignIt = lowerBoundAlignedReads(fragStore.alignedReadStore, idCount, SortContigId());
1245         TAlignIter alignItEnd = upperBoundAlignedReads(fragStore.alignedReadStore, idCount, SortContigId());
1246 
1247         // Sort the reads according to the begin position
1248         sortAlignedReads(infix(fragStore.alignedReadStore, alignIt - begin(fragStore.alignedReadStore, Standard()), alignItEnd - begin(fragStore.alignedReadStore, Standard())), SortBeginPos());
1249         TAlignIter alignItTmp = lowerBoundAlignedReads(fragStore.alignedReadStore, idCount, SortContigId());
1250         TAlignIter alignItTmpEnd = upperBoundAlignedReads(fragStore.alignedReadStore, idCount, SortContigId());
1251         String<std::pair<TSize, TSize> > idToPos;
1252         reserve(idToPos, alignItTmpEnd - alignItTmp);
1253         for(TSize iCount = 0; alignItTmp!=alignItTmpEnd; ++iCount, ++alignItTmp)
1254             appendValue(idToPos, std::make_pair(alignItTmp->id, readSlot[iCount]));
1255         std::sort(begin(idToPos, Standard()), end(idToPos, Standard()));
1256 
1257         // Sort the reads according to the id
1258         sortAlignedReads(infix(fragStore.alignedReadStore, alignIt - begin(fragStore.alignedReadStore, Standard()), alignItEnd - begin(fragStore.alignedReadStore, Standard())), SortId());
1259         alignIt = lowerBoundAlignedReads(fragStore.alignedReadStore, idCount, SortContigId());
1260         alignItEnd = upperBoundAlignedReads(fragStore.alignedReadStore, idCount, SortContigId());
1261 
1262         bool noNamesPresent = (length(fragStore.readNameStore) == 0);
1263         for(TSize iCount = 0;alignIt != alignItEnd; ++alignIt, ++iCount) {
1264 
1265             // Print all reads
1266             write(target, "typ:");
1267             if (!noNamesPresent)
1268             {
1269                 writeValue(target, 'R');
1270                 appendNumber(target, iCount);
1271             }
1272             else
1273             {
1274                 write(target, fragStore.readNameStore[alignIt->readId]);
1275             }
1276             write(target, "\nseq:");
1277             write(target, fragStore.readSeqStore[alignIt->readId]);
1278             write(target, "\nPos:");
1279             appendNumber(target, alignIt->beginPos);
1280             writeValue(target, ',');
1281             appendNumber(target, alignIt->endPos);
1282             writeValue(target, '\n');
1283 #ifndef CELERA_OFFSET
1284             TSize begClr = 0;
1285             TSize endClr = 0;
1286             getClrRange(fragStore, *alignIt, begClr, endClr);
1287             write(target, "clr:");
1288             appendNumber(target, begClr);
1289             writeValue(target, ',');
1290             appendNumber(target, endClr);
1291             writeValue(target, '\n');
1292 #endif
1293             std::stringstream gapCoords;
1294             TSize letterCount = 0;
1295             TSize gapCount = 0;
1296             for(TContigPos column = _min(alignIt->beginPos, alignIt->endPos); column < _max(alignIt->beginPos, alignIt->endPos); ++column) {
1297                 if (mat[idToPos[iCount].second * len + column] == gapChar) {
1298                     ++gapCount;
1299                     gapCoords << letterCount << ' ';
1300                 } else ++letterCount;
1301             }
1302             write(target, "dln:");
1303             appendNumber(target, gapCount);
1304             writeValue(target, '\n');
1305             write(target, "del:");
1306             write(target, gapCoords.str());
1307             writeValue(target, '\n');
1308             writeValue(target, '\n');
1309         }
1310     }
1311 }
1312 
1313 //////////////////////////////////////////////////////////////////////////////
1314 // Read simulator format: Simple fasta read file with positions
1315 //////////////////////////////////////////////////////////////////////////////
1316 
1317 
1318 //////////////////////////////////////////////////////////////////////////////////
1319 
1320 
1321 template<typename TFile, typename TSpec, typename TConfig, typename TFilePath>
1322 inline void
_convertSimpleReadFile(TFile &,FragmentStore<TSpec,TConfig> &,TFilePath &,bool)1323 _convertSimpleReadFile(TFile& /*file*/,
1324                        FragmentStore<TSpec, TConfig>& /*fragStore*/,
1325                        TFilePath& /*filePath*/,
1326                        bool /*moveToFront*/)
1327 {
1328 return;
1329 ////IOREV _nodoc_ huge undocumented function, uses custom IO based on iostream and FILE* :S
1330 //    // Basic types
1331 //    typedef FragmentStore<TSpec, TConfig> TFragmentStore;
1332 //    typedef typename Id<TFragmentStore>::Type TId;
1333 //    typedef typename Size<TFragmentStore>::Type TSize;
1334 //    //typedef typename Value<TFile>::Type TValue;
1335 //    typedef typename TFragmentStore::TContigPos TPos;
1336 //    typedef typename TFragmentStore::TReadSeq TReadSeq;
1337 //
1338 //    // All fragment store element types
1339 //    typedef typename Value<typename TFragmentStore::TContigStore>::Type TContigElement;
1340 //    typedef typename Value<typename TFragmentStore::TLibraryStore>::Type TLibraryStoreElement;
1341 //    typedef typename Value<typename TFragmentStore::TMatePairStore>::Type TMatePairElement;
1342 //    typedef typename Value<typename TFragmentStore::TReadStore>::Type TReadStoreElement;
1343 //    typedef typename Value<typename TFragmentStore::TAlignedReadStore>::Type TAlignedElement;
1344 //
1345 //
1346 //    // All maps to mirror file ids to our internal ids
1347 //    typedef std::map<TId, TId> TIdMap;
1348 //    TIdMap libIdMap;
1349 //    TIdMap frgIdMap;
1350 //    TIdMap readIdMap;
1351 //
1352 //    // Create RecordReader object.
1353 //    typename DirectionIterator<TFile, Input>::Type reader = directionIterator(file, Input());
1354 //
1355 //    // Parse the file and convert the internal ids
1356 //    TPos maxPos = 0;
1357 //    TPos minPos = MaxValue<TPos>::VALUE;
1358 //    TId count = 0;
1359 //    if (atEnd(reader))
1360 //        return false;
1361 //    CharString buffer;
1362 //    while (!atEnd(reader))
1363 //    {
1364 //        // New read?
1365 //        if (value(reader) == '>')
1366 //        {
1367 //            TAlignedElement alignEl;
1368 //            TId id = count;
1369 //            TId fragId = TReadStoreElement::INVALID_ID;
1370 //            TId repeatId = 0;
1371 //
1372 //            goNext(reader);
1373 //            if (skipWhitespaces(reader) != 0)
1374 //                return 1;
1375 //
1376 //            // Get the layout positions
1377 //            clear(buffer);
1378 //            if (readDigits(buffer, reader) != 0)
1379 //                return 1;
1380 //            if (!lexicalCast2(alignEl.beginPos, buffer))
1381 //                return 1;
1382 //            goNext(reader);
1383 //            if (skipWhitespaces(reader) != 0)
1384 //                return 1;
1385 //            clear(buffer);
1386 //            if (readDigits(buffer, reader) != 0)
1387 //                return 1;
1388 //            if (!lexicalCast2(alignEl.endPos, buffer))
1389 //                return 1;
1390 //
1391 //            // Any attributes?
1392 //            String<char> eid;
1393 //            String<char> qlt;
1394 //            TReadSeq seq;
1395 //            if (value(reader) == '[')
1396 //            {
1397 //                String<char> fdIdentifier;
1398 //                while (value(reader) != ']')
1399 //                {
1400 //                    goNext(reader);
1401 //                    if (skipWhitespaces(reader) != 0)
1402 //                        return 1;
1403 //                    clear(fdIdentifier);
1404 //                    if (readAlphaNums(fdIdentifier, reader) != 0)
1405 //                        return 1;
1406 //                    goNext(reader);  // Skip "="
1407 //                    if (fdIdentifier == "id")
1408 //                    {
1409 //                        clear(buffer);
1410 //                        if (readDigits(buffer, reader) != 0)
1411 //                            return 1;
1412 //                        if (!lexicalCast2(id, buffer))
1413 //                            return 1;
1414 //                    } else if (fdIdentifier == "fragId") {
1415 //                        clear(buffer);
1416 //                        if (readDigits(buffer, reader) != 0)
1417 //                            return 1;
1418 //                        if (!lexicalCast2(fragId, buffer))
1419 //                            return 1;
1420 //                    } else if (fdIdentifier == "repeatId") {
1421 //                        clear(buffer);
1422 //                        if (readDigits(buffer, reader) != 0)
1423 //                            return 1;
1424 //                        if (!lexicalCast2(repeatId, buffer))
1425 //                            return 1;
1426 //                    } else if (fdIdentifier == "eid") {
1427 //                        if (readUntilOneOf(eid, reader, ',', ']') != 0)
1428 //                            return 1;
1429 //                    } else if (fdIdentifier == "qlt") {
1430 //                        if (readUntilOneOf(qlt, reader, ',', ']') != 0)
1431 //                            return 1;
1432 //                    } else {
1433 //                        // Jump to next attribute
1434 //                        // TODO(holtgrew): Add skipUntilOneOf()?
1435 //                        if (readUntilOneOf(buffer, reader, ',', ']') != 0)
1436 //                            return 1;
1437 //                    }
1438 //                }
1439 //            }
1440 //            if (skipLine(reader) != 0)
1441 //                return 1;
1442 //            if (skipWhitespaces(reader) != 0)
1443 //                return 1;
1444 //            while (!atEnd(reader) && value(reader) != '>')
1445 //            {
1446 //                if (readLetters(seq, reader) != 0)
1447 //                    return 1;
1448 //                int res = skipWhitespaces(reader);
1449 //                if (res != 0 && res != EOF_BEFORE_SUCCESS)
1450 //                    return 1;
1451 //            }
1452 //
1453 //            // Set quality
1454 //            typedef typename Iterator<TReadSeq, Standard>::Type TReadIter;
1455 //            typedef typename Iterator<String<char> >::Type TQualIter;
1456 //            TReadIter begIt = begin(seq, Standard() );
1457 //            TReadIter begItEnd = begin(seq, Standard() );
1458 //            if (length(qlt)) {
1459 //                TQualIter qualIt = begin(qlt);
1460 //                TQualIter qualItEnd = end(qlt);
1461 //                for(;qualIt != qualItEnd; goNext(qualIt), goNext(begIt)) assignQualityValue(value(begIt), value(qualIt));
1462 //            } else {
1463 //                for(;begIt != begItEnd; goNext(begIt)) assignQualityValue(value(begIt), 'D');
1464 //            }
1465 //
1466 //            // Set eid if not given
1467 //            if (empty(eid)) {
1468 //                std::stringstream input;
1469 //                input << "R" << id;
1470 //                input << "-" << repeatId;
1471 //                eid = input.str().c_str();
1472 //            }
1473 //
1474 //            // Insert the read
1475 //            readIdMap.insert(std::make_pair(id, static_cast<TId>(length(fragStore.readStore))));
1476 //            appendRead(fragStore, seq, fragId);
1477 //            appendValue(fragStore.readNameStore, eid, Generous());
1478 //
1479 //            // Insert an aligned read
1480 //            TSize readLen = length(seq);
1481 //            if (alignEl.beginPos < alignEl.endPos) {
1482 //                if ((TPos)readLen != alignEl.endPos - alignEl.beginPos) {
1483 //                    alignEl.endPos = alignEl.beginPos + readLen;
1484 //                }
1485 //                if (alignEl.beginPos < minPos) minPos = alignEl.beginPos;
1486 //                if (alignEl.endPos > maxPos) maxPos = alignEl.endPos;
1487 //            } else {
1488 //                if ((TPos)readLen != alignEl.beginPos - alignEl.endPos) {
1489 //                    alignEl.beginPos = alignEl.endPos + readLen;
1490 //                }
1491 //                if (alignEl.endPos < minPos) minPos = alignEl.endPos;
1492 //                if (alignEl.beginPos > maxPos) maxPos = alignEl.beginPos;
1493 //            }
1494 //            alignEl.readId = id;
1495 //            alignEl.pairMatchId =  fragId;
1496 //            alignEl.contigId = 0;
1497 //            alignEl.id = length(fragStore.alignedReadStore);
1498 //            appendValue(fragStore.alignedReadStore, alignEl, Generous());
1499 //            ++count;
1500 //        } else {
1501 //            if (skipLine(reader) != 0)
1502 //                return 1;
1503 //        }
1504 //    }
1505 //
1506 //    // Read contig or reference sequence
1507 //    TContigElement contigEl;
1508 //    std::string fileName = filePath + 'S';
1509 //    SeqFileIn strmRef;
1510 //    String<char> contigEid = "C0";
1511 //    if (open(strmRef, fileName.c_str()))
1512 //    {
1513 //        typename DirectionIterator<TFile, Input>::Type readerRef = directionIterator(strmRef, Input());
1514 //        clear(contigEid);
1515 //        readRecord(contigEid, contigEl.seq, strmRef);
1516 //        close(strmRef);
1517 //    }
1518 //    if (empty(contigEl.seq)) {
1519 //        typedef typename TFragmentStore::TContigGapAnchor TContigGapAnchor;
1520 //        if (moveToFront) appendValue(contigEl.gaps, TContigGapAnchor(0, maxPos - minPos), Generous());
1521 //        else appendValue(contigEl.gaps, TContigGapAnchor(0, maxPos), Generous());
1522 //    }
1523 //    appendValue(fragStore.contigStore, contigEl, Generous());
1524 //    appendValue(fragStore.contigNameStore, contigEid, Generous());
1525 //
1526 //
1527 //    // Read fragments
1528 //    fileName = filePath + 'F';
1529 //    FILE* strmFrag = fopen(fileName.c_str(), "rb");
1530 //    if (!strmFrag)
1531 //        return 1;
1532 //    RecordReader<FILE *, SinglePass<> > readerFrag(strmFrag);
1533 //    while (!atEnd(readerFrag))
1534 //    {
1535 //        if (value(readerFrag) == '>')
1536 //        {
1537 //            TMatePairElement matePairEl;
1538 //            goNext(readerFrag);
1539 //            if (skipWhitespaces(readerFrag) != 0)
1540 //                return 1;
1541 //
1542 //            // Get the fragment id
1543 //            clear(buffer);
1544 //            if (readDigits(buffer, readerFrag) != 0)
1545 //                return 1;
1546 //            TId id = 0;
1547 //            if (!lexicalCast2(id, buffer))
1548 //                return 1;
1549 //
1550 //            // Any attributes?
1551 //            std::stringstream input;
1552 //            input << "F" << id;
1553 //            String<char> eid(input.str().c_str());
1554 //            if (value(readerFrag) == '[')
1555 //            {
1556 //                String<char> fdIdentifier;
1557 //                while (value(readerFrag) != ']')
1558 //                {
1559 //                    goNext(readerFrag);
1560 //                    if (skipWhitespaces(readerFrag) != 0)
1561 //                        return 1;
1562 //                    clear(fdIdentifier);
1563 //                    if (readAlphaNums(fdIdentifier, readerFrag) != 0)
1564 //                        return 1;
1565 //                    goNext(readerFrag);
1566 //                    if (fdIdentifier == "libId")
1567 //                    {
1568 //                        clear(buffer);
1569 //                        if (readDigits(buffer, readerFrag) != 0)
1570 //                            return 1;
1571 //                        if (!lexicalCast2(matePairEl.libId, buffer))
1572 //                            return 1;
1573 //                    } else if (fdIdentifier == "eid") {
1574 //                        clear(eid);
1575 //                        if (readUntilOneOf(eid, readerFrag, ',', ']') != 0)
1576 //                            return 1;
1577 //                    } else {
1578 //                        // Jump to next attribute
1579 //                        if (readUntilOneOf(buffer, readerFrag, ',', ']') != 0)
1580 //                            return 1;
1581 //                    }
1582 //                }
1583 //            }
1584 //            if (skipLine(readerFrag) != 0)
1585 //                return 1;
1586 //
1587 //            // Read the two reads belonging to this mate pair
1588 //            clear(buffer);
1589 //            for (int i = 0; i < 2; ++i)
1590 //            {
1591 //                if (skipWhitespaces(readerFrag) != 0)
1592 //                    return 1;
1593 //                clear(buffer);
1594 //                if (readDigits(buffer, readerFrag) != 0)
1595 //                    return 1;
1596 //                if (!lexicalCast2(matePairEl.readId[i], buffer))
1597 //                    return 1;
1598 //                if (!i)  // Skip ','.
1599 //                    goNext(readerFrag);
1600 //            }
1601 //            int res = skipLine(readerFrag);
1602 //            if (res != 0 && res != EOF_BEFORE_SUCCESS)
1603 //                return 1;
1604 //
1605 //            // Insert mate pair
1606 //            if (matePairEl.readId[0] != matePairEl.readId[1]) {
1607 //                frgIdMap.insert(std::make_pair(id, static_cast<TId>(length(fragStore.matePairStore))));
1608 //                appendValue(fragStore.matePairStore, matePairEl, Generous());
1609 //                appendValue(fragStore.matePairNameStore, eid, Generous());
1610 //            }
1611 //        } else {
1612 //            int res = skipLine(readerFrag);
1613 //            if (res != 0 && res != EOF_BEFORE_SUCCESS)
1614 //                return 1;
1615 //        }
1616 //    }
1617 //    fclose(strmFrag);
1618 //
1619 //    // Read libraries
1620 //    fileName = filePath + 'L';
1621 //    FILE* strmLib = fopen(fileName.c_str(), "rb");
1622 //    if (!strmLib)
1623 //        return 1;
1624 //    RecordReader<FILE *, SinglePass<> > readerLib(strmLib);
1625 //    while (!atEnd(readerLib))
1626 //    {
1627 //        if (value(readerLib) == '>')
1628 //        {
1629 //            TLibraryStoreElement libEl;
1630 //            goNext(readerLib);
1631 //            if (skipWhitespaces(readerLib) != 0)
1632 //                return 1;
1633 //
1634 //            // Get the fragment id
1635 //            clear(buffer);
1636 //            if (readDigits(buffer, readerLib) != 0)
1637 //                return 1;
1638 //            TId id = 0;
1639 //            if (!lexicalCast2(id, buffer))
1640 //                return 1;
1641 //
1642 //            // Any attributes?
1643 //            std::stringstream input;
1644 //            input << "L" << id;
1645 //            String<char> eid(input.str().c_str());
1646 //            if (value(readerLib) == '[')
1647 //            {
1648 //                String<char> fdIdentifier;
1649 //                while (value(readerLib) != ']')
1650 //                {
1651 //                    goNext(readerLib);
1652 //                    if (skipWhitespaces(readerLib) != 0)
1653 //                        return 1;
1654 //                    clear(fdIdentifier);
1655 //                    if (readAlphaNums(fdIdentifier, readerLib) != 0)
1656 //                        return 1;
1657 //                    if (fdIdentifier == "eid")
1658 //                    {
1659 //                        clear(eid);
1660 //                        if (readUntilOneOf(eid, readerLib, ',', ']') != 0)
1661 //                            return 1;
1662 //                    } else {
1663 //                        // Jump to next attribute
1664 //                        // TODO(holtgrew): skipUntilOneOf()?
1665 //                        if (readUntilOneOf(buffer, readerLib, ',', ']') != 0)
1666 //                            return 1;
1667 //                    }
1668 //                }
1669 //            }
1670 //            if (skipLine(readerLib) != 0)
1671 //                return 1;
1672 //            if (skipWhitespaces(readerLib) != 0)
1673 //                return 1;
1674 //
1675 //            // Read the mean and standard deviation
1676 //            clear(buffer);
1677 //            if (readDigits(buffer, readerLib) != 0)
1678 //                return 1;
1679 //            if (!lexicalCast2(libEl.mean, buffer))
1680 //                return 1;
1681 //            if (skipWhitespaces(readerLib) != 0)
1682 //                return 1;
1683 //            goNext(readerLib);
1684 //            clear(buffer);
1685 //            if (readDigits(buffer, readerLib) != 0)
1686 //                return 1;
1687 //            if (!lexicalCast2(libEl.std, buffer))
1688 //                return 1;
1689 //            int res = skipLine(readerLib);
1690 //            if (res != 0 && res != EOF_BEFORE_SUCCESS)
1691 //                return 1;
1692 //
1693 //            // Insert mate pair
1694 //            libIdMap.insert(std::make_pair(id, static_cast<TId>(length(fragStore.libraryStore))));
1695 //            appendValue(fragStore.libraryStore, libEl, Generous());
1696 //            appendValue(fragStore.libraryNameStore, eid, Generous());
1697 //        } else {
1698 //            int res = skipLine(readerLib);
1699 //            if (res != 0 && res != EOF_BEFORE_SUCCESS)
1700 //                return 1;
1701 //        }
1702 //    }
1703 //    fclose(strmLib);
1704 //
1705 //    // Renumber all ids
1706 //    typedef typename TIdMap::const_iterator TIdMapIter;
1707 //    typedef typename Iterator<typename TFragmentStore::TMatePairStore>::Type TMateIter;
1708 //    TMateIter mateIt = begin(fragStore.matePairStore);
1709 //    TMateIter mateItEnd = end(fragStore.matePairStore);
1710 //    for(;mateIt != mateItEnd; goNext(mateIt)) {
1711 //        if (mateIt->libId != TMatePairElement::INVALID_ID) {
1712 //            TIdMapIter libIdPos = libIdMap.find(mateIt->libId);
1713 //            if (libIdPos != libIdMap.end()) mateIt->libId = libIdPos->second;
1714 //            else mateIt->libId = TMatePairElement::INVALID_ID;
1715 //        }
1716 //        if (mateIt->readId[0] != TMatePairElement::INVALID_ID) {
1717 //            TIdMapIter readIdPos = readIdMap.find(mateIt->readId[0]);
1718 //            if (readIdPos != readIdMap.end()) mateIt->readId[0] = readIdPos->second;
1719 //            else mateIt->readId[0] = TMatePairElement::INVALID_ID;
1720 //        }
1721 //        if (mateIt->readId[1]!= TMatePairElement::INVALID_ID) {
1722 //            TIdMapIter readIdPos = readIdMap.find(mateIt->readId[1]);
1723 //            if (readIdPos != readIdMap.end()) mateIt->readId[1] = readIdPos->second;
1724 //            else mateIt->readId[0] = TMatePairElement::INVALID_ID;
1725 //        }
1726 //    }
1727 //    typedef typename Iterator<typename TFragmentStore::TReadStore>::Type TReadIter;
1728 //    TReadIter readIt = begin(fragStore.readStore);
1729 //    TReadIter readItEnd = end(fragStore.readStore);
1730 //    for(;readIt != readItEnd; goNext(readIt)) {
1731 //        if (readIt->matePairId != TReadStoreElement::INVALID_ID) {
1732 //            TIdMapIter mateIdPos = frgIdMap.find(readIt->matePairId);
1733 //            if (mateIdPos != frgIdMap.end()) readIt->matePairId = mateIdPos->second;
1734 //            else readIt->matePairId = TReadStoreElement::INVALID_ID;
1735 //        }
1736 //    }
1737 //    typedef typename Iterator<typename TFragmentStore::TAlignedReadStore>::Type TAlignIter;
1738 //    TAlignIter alignIt = begin(fragStore.alignedReadStore);
1739 //    TAlignIter alignItEnd = end(fragStore.alignedReadStore);
1740 //    for(;alignIt != alignItEnd; goNext(alignIt)) {
1741 //        if (alignIt->readId != TAlignedElement::INVALID_ID) {
1742 //            TIdMapIter readIdPos = readIdMap.find(alignIt->readId);
1743 //            if (readIdPos != readIdMap.end()) alignIt->readId = readIdPos->second;
1744 //            else alignIt->readId = TAlignedElement::INVALID_ID;
1745 //        }
1746 //        if (moveToFront) {
1747 //            alignIt->beginPos -= minPos;
1748 //            alignIt->endPos -= minPos;
1749 //        }
1750 //    }
1751 //    return 0;
1752 }
1753 
1754 
1755 //////////////////////////////////////////////////////////////////////////////
1756 // Rudimentary write functions for CeleraFrg and Celera Cgb
1757 //////////////////////////////////////////////////////////////////////////////
1758 
1759 template<typename TFile, typename TSpec, typename TConfig>
1760 inline void
_writeCeleraFrg(TFile & file,FragmentStore<TSpec,TConfig> & fragStore)1761 _writeCeleraFrg(TFile& file,
1762                 FragmentStore<TSpec, TConfig>& fragStore)
1763 {
1764 //IOREV _nodoc_
1765     typedef FragmentStore<TSpec, TConfig> TFragmentStore;
1766     typedef typename Size<TFragmentStore>::Type TSize;
1767     typedef typename TFragmentStore::TReadPos TReadPos;
1768 
1769     typename DirectionIterator<TFile, Output>::Type target = directionIterator(file, Output());
1770 
1771     // Iterate over all aligned reads to get the clear ranges
1772     typedef Pair<TReadPos, TReadPos> TClrRange;
1773     String<TClrRange> clearStr;
1774     resize(clearStr, length(fragStore.readStore));
1775     typedef typename Iterator<typename TFragmentStore::TAlignedReadStore>::Type TAlignIter;
1776     TAlignIter alignIt = begin(fragStore.alignedReadStore);
1777     TAlignIter alignItEnd = end(fragStore.alignedReadStore);
1778     for(;alignIt != alignItEnd; goNext(alignIt))
1779     {
1780         TReadPos begClr = 0;
1781         TReadPos endClr = 0;
1782         getClrRange(fragStore, value(alignIt), begClr, endClr);
1783         value(clearStr, alignIt->readId) = TClrRange(begClr, endClr);
1784     }
1785 
1786     // Write Reads
1787     typedef typename Iterator<typename TFragmentStore::TReadStore>::Type TReadIter;
1788     TReadIter readIt = begin(fragStore.readStore);
1789     TReadIter readItEnd = end(fragStore.readStore);
1790     bool noNamesPresent = (length(fragStore.readNameStore) == 0);
1791     for (TSize idCount = 0; readIt != readItEnd; goNext(readIt), ++idCount)
1792     {
1793         write(target, "{FRG\nact:A\nacc:");
1794         appendNumber(target, idCount + 1);
1795         write(target, "\ntyp:R\n");
1796         if (!noNamesPresent)
1797         {
1798             write(target, "src:\n");
1799             write(target, fragStore.readNameStore[idCount]);
1800             write(target, "\n.\n");
1801         }
1802         write(target, "etm:0\nseq:\n");
1803         writeWrappedString(target, fragStore.readSeqStore[idCount], 70);
1804         write(target, ".\nqlt:\n");
1805         typedef typename Value<typename TFragmentStore::TReadSeqStore>::Type TReadSeq;
1806         typedef QualityExtractor<typename Value<TReadSeq>::Type> TQualityExtractor;
1807         ModifiedString<TReadSeq const, ModView<TQualityExtractor> > quals(fragStore.readSeqStore[idCount]);
1808         writeWrappedString(target, quals, 70);
1809         // Note: Clear range does not have to be ordered, e.g. no indication for reverse complemented reads, this is happening in cgb records
1810         write(target, ".\nclr:");
1811         appendNumber(target, clearStr[idCount].i1);
1812         writeValue(target, ',');
1813         appendNumber(target, clearStr[idCount].i2);
1814         write(target, "\n}\n");
1815     }
1816 }
1817 
1818 
1819 //////////////////////////////////////////////////////////////////////////////
1820 
1821 // Writes out the first contig only.
1822 
1823 // TODO(holtgrew): This is only used in seqcons, should it go into app?
1824 template<typename TFile, typename TSpec, typename TConfig>
1825 inline void
_writeCeleraCgb(TFile & file,FragmentStore<TSpec,TConfig> & fragStore)1826 _writeCeleraCgb(TFile& file,
1827                 FragmentStore<TSpec, TConfig>& fragStore)
1828 {
1829     typedef FragmentStore<TSpec, TConfig> TFragmentStore;
1830     typedef typename Size<TFragmentStore>::Type TSize;
1831     typedef typename Id<TFragmentStore>::Type TId;
1832     //typedef typename TFragmentStore::TReadPos TReadPos;
1833 
1834     typename DirectionIterator<TFile, Output>::Type target = directionIterator(file, Output());
1835 
1836     // Write the first contig
1837     TId contigId = 0;
1838 
1839     // Sort the reads according to position
1840     sortAlignedReads(fragStore.alignedReadStore, SortBeginPos());
1841 
1842     // Write Header
1843     write(target, "{IUM\nacc:0\nsrc:\ngen> @@ [0,0]\n.\ncov:0.000\nsta:X\nfur:X\nabp:0\nbbp:0\nlen:");
1844     appendNumber(target, length((value(fragStore.contigStore, contigId)).seq));
1845     write(target, "\ncns:\n.\nqlt:\n.\nfor:0\nnfr:");
1846     appendNumber(target, length(fragStore.readStore));
1847     writeValue(target, '\n');
1848 
1849     // Write reads
1850     typedef typename Iterator<typename TFragmentStore::TAlignedReadStore>::Type TAlignIter;
1851     TAlignIter alignIt = begin(fragStore.alignedReadStore);
1852     TAlignIter alignItEnd = end(fragStore.alignedReadStore);
1853     TSize offsetLeft = _min(alignIt->beginPos, alignIt->endPos);
1854     for (;alignIt != alignItEnd; goNext(alignIt))
1855     {
1856         if (contigId != alignIt->contigId)
1857             continue;
1858         write(target, "{IMP\ntyp:R\nmid:");
1859         appendNumber(target, alignIt->readId + 1);
1860         write(target, "\ncon:0\npos:");
1861         appendNumber(target, alignIt->beginPos - offsetLeft);
1862         writeValue(target, ',');
1863         appendNumber(target, alignIt->endPos - offsetLeft);
1864         write(target, "\ndln:0\ndel:\n}\n");
1865     }
1866     write(target, "}\n");
1867 }
1868 
1869 }  // namespace seqan
1870 
1871 #endif //#ifndef SEQAN_HEADER_...
1872