1 // ==========================================================================
2 //                 SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 //     * Redistributions of source code must retain the above copyright
11 //       notice, this list of conditions and the following disclaimer.
12 //     * Redistributions in binary form must reproduce the above copyright
13 //       notice, this list of conditions and the following disclaimer in the
14 //       documentation and/or other materials provided with the distribution.
15 //     * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 //       its contributors may be used to endorse or promote products derived
17 //       from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: Tobias Rausch <rausch@embl.de>
33 // ==========================================================================
34 
35 #ifndef SEQAN_HEADER_CONSENSUS_BASE_H
36 #define SEQAN_HEADER_CONSENSUS_BASE_H
37 
38 namespace seqan
39 {
40 
41 
42 //////////////////////////////////////////////////////////////////////////////
43 // Segment Match Generation tag
44 //////////////////////////////////////////////////////////////////////////////
45 
46 // TODO(holtgrew): This apparently belongs into graph_msa?
47 
48 struct OverlapLibrary_;
49 typedef Tag<OverlapLibrary_> const OverlapLibrary;
50 
51 
52 
53 //////////////////////////////////////////////////////////////////////////////
54 // Consensus tag
55 //////////////////////////////////////////////////////////////////////////////
56 
57 /*
58  * @defgroup ConsensusCallingTags Consensus Calling Tags
59  * @brief Tags for consensus calling.
60  *
61  * @tag ConsensusCallingTags#MajorityVote
62  * @headerfile <seqan/consensus.h>
63  * @brief Consensus based on the most common character.
64  *
65  * @tag ConsensusCallingTags#Bayesian
66  * @headerfile <seqan/consensus.h>
67  * @brief Consensus based on a bayesian probability.
68  */
69 
70 
71 struct MajorityVote_;
72 typedef Tag<MajorityVote_> const MajorityVote;
73 
74 struct Bayesian_;
75 typedef Tag<Bayesian_> const Bayesian;
76 
77 
78 
79 //////////////////////////////////////////////////////////////////////////////
80 // Read alignment and Consensus Generation
81 //////////////////////////////////////////////////////////////////////////////
82 
83 
84 //////////////////////////////////////////////////////////////////////////////////
85 
86 struct ConsensusOptions {
87 public:
88     // Method
89     // 0: graph-based multiple sequence alignment
90     // 1: realign
91     int method;
92 
93     // ReAlign Method
94     // 0: Needleman-Wunsch
95     // 1: Gotoh
96     int rmethod;
97 
98     // Bandwidth of overlap alignment
99     int bandwidth;
100 
101     // Number of computed overlaps per read (at the beginning and end of a read)
102     int overlaps;
103 
104     // Minimum match length of a computed overlap
105     int matchlength;
106 
107     // Minimum quality (in percent identity) of a computed overlap
108     int quality;
109 
110     // Window size, only relevant for insert sequencing
111     // If window == 0, no insert sequencing is assumed
112     int window;
113 
114     // Output
115     // 0: seqan style
116     // 1: afg output format
117     // 2: frg output format
118     // 3: cgb output format
119     // 4: Sam output format
120     int output;
121 
122     // Multi-read alignment
123     bool noalign;
124 
125     // Offset all reads, so the first read starts at position 0
126     bool moveToFront;
127 
128     // Include reference genome
129     bool include;
130 
131     // Scoring object for overlap alignments
132     Score<int> sc;
133 
134     // Various input and output files
135     std::string readsfile;                // File of reads in FASTA format
136     std::string afgfile;                // AMOS afg file input
137     std::string samfile;                // Sam file input
138     std::string contigsfile;            // FASTA reference file for Sam input
139     std::string outfile;                // Output file name
140 
141     // Initialization
ConsensusOptionsConsensusOptions142     ConsensusOptions() :
143         method(0), rmethod(0), bandwidth(0), overlaps(0), matchlength(0), quality(0),
144         window(0), output(0), noalign(false), moveToFront(0), include(0), sc(2, -6, -4, -9)
145     {}
146 };
147 
148 
149 //////////////////////////////////////////////////////////////////////////////////
150 
151 // Copy reads for the whole contig out of fragStore and into strSet.  The start and end
152 // positions of the alignments go into startEndPos.
153 
154 template<typename TValue, typename TStrSpec, typename TPosPair, typename TStringSpec, typename TSpec, typename TConfig, typename TId>
155 inline void
_loadContigReads(StringSet<TValue,Owner<TStrSpec>> & strSet,String<TPosPair,TStringSpec> & startEndPos,FragmentStore<TSpec,TConfig> const & fragStore,TId const contigId)156 _loadContigReads(StringSet<TValue, Owner<TStrSpec> > & strSet,
157                  String<TPosPair, TStringSpec> & startEndPos,
158                  FragmentStore<TSpec, TConfig> const & fragStore,
159                  TId const contigId)
160 {
161     typedef FragmentStore<TSpec, TConfig> const TFragmentStore;
162     typedef typename Size<TFragmentStore>::Type TSize;
163     typedef typename TFragmentStore::TReadPos TReadPos;
164 
165 
166     // Sort aligned reads according to contig id
167     sortAlignedReads(fragStore.alignedReadStore, SortContigId());
168     resize(strSet, length(fragStore.alignedReadStore));
169 
170     // Retrieve all reads, limit them to the clear range and if required reverse complement them
171     typedef typename Iterator<typename TFragmentStore::TAlignedReadStore const>::Type TAlignIter;
172     TAlignIter alignIt = lowerBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId());
173     TAlignIter alignItEnd = upperBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId());
174     TSize numRead = 0;
175     TReadPos begClr = 0;
176     TReadPos endClr = 0;
177     TSize lenRead = 0;
178     TSize offset = 0;
179     for(;alignIt != alignItEnd; ++alignIt) {
180         offset = _min(alignIt->beginPos, alignIt->endPos);
181         getClrRange(fragStore, *alignIt, begClr, endClr);
182         strSet[numRead] = infix(fragStore.readSeqStore[alignIt->readId], begClr, endClr);
183         lenRead = endClr - begClr;
184         if (alignIt->beginPos < alignIt->endPos) appendValue(startEndPos, TPosPair(offset, offset + lenRead), Generous());
185         else {
186             reverseComplement(strSet[numRead]);
187             appendValue(startEndPos, TPosPair(offset + lenRead, offset), Generous());
188         }
189         ++numRead;
190     }
191     resize(strSet, numRead, Exact());
192 }
193 
194 
195 
196 
197 //////////////////////////////////////////////////////////////////////////////
198 
199 template<typename TSpec, typename TConfig, typename TMatrix, typename TSize2, typename TSize, typename TReadSlot>
200 inline bool
convertAlignment(FragmentStore<TSpec,TConfig> & fragStore,TMatrix & mat,TSize2 contigId,TSize & coverage,TReadSlot & slot)201 convertAlignment(FragmentStore<TSpec, TConfig>& fragStore,
202                  TMatrix& mat,
203                  TSize2 contigId,
204                  TSize& coverage,
205                  TReadSlot& slot)
206 {
207     typedef FragmentStore<TSpec, TConfig> TFragmentStore;
208     typedef typename Value<TMatrix>::Type TValue;
209     typedef typename TFragmentStore::TContigPos TContigPos;
210 
211     // Gap char
212     TValue gapChar = gapValue<TValue>();
213 
214     // Sort according to contigId
215     sortAlignedReads(fragStore.alignedReadStore, SortContigId());
216 
217     // Find range of the given contig
218     typedef typename Iterator<typename TFragmentStore::TAlignedReadStore, Standard>::Type TAlignIter;
219     TAlignIter alignIt = lowerBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId());
220     TAlignIter alignItEnd = upperBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId());
221 
222     // Sort the reads according to the begin position
223     sortAlignedReads(infix(fragStore.alignedReadStore, alignIt - begin(fragStore.alignedReadStore, Standard()), alignItEnd - begin(fragStore.alignedReadStore, Standard())), SortBeginPos());
224     TAlignIter alignItBegin = alignIt = lowerBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId());
225     alignItEnd = upperBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId());
226 
227     // Get the maximum coverage and the slot for each read
228     typedef String<TSize> TFirstFreePos;
229     typedef typename Iterator<TFirstFreePos, Standard>::Type TPosIter;
230     TFirstFreePos freePos;
231     TSize pos = 0;
232     TSize maxTmp = 0;
233     TSize numCol = 0;
234     reserve(slot, alignItEnd - alignIt);
235     for(;alignIt != alignItEnd; ++alignIt) {
236         TPosIter itPos = begin(freePos, Standard());
237         TPosIter itPosEnd = end(freePos, Standard());
238         pos = 0;
239         for(;itPos != itPosEnd; ++itPos, ++pos)
240             if ((TContigPos)*itPos < _min(alignIt->beginPos, alignIt->endPos)) break;
241         if (pos + 1 > length(freePos)) resize(freePos, pos+1, Generous());
242         maxTmp = _max(alignIt->beginPos, alignIt->endPos);
243         freePos[pos] = maxTmp;
244         if (maxTmp > numCol) numCol = maxTmp;
245         appendValue(slot, pos);
246     }
247     coverage = length(freePos);
248     clear(freePos);
249 
250     // Fill the matrix
251     typedef typename Iterator<TMatrix, Standard>::Type TMatIter;
252     resize(mat, coverage * numCol, '.');
253     alignIt = alignItBegin;
254     TSize readPos = 0;
255     TMatIter matIt = begin(mat, Standard());
256     typename TFragmentStore::TReadSeq myRead;
257     for(;alignIt != alignItEnd; ++alignIt, ++readPos) {
258         typedef typename Iterator<String<typename TFragmentStore::TReadGapAnchor>, Standard>::Type TReadGapsIter;
259         TReadGapsIter itGaps = begin(alignIt->gaps, Standard());
260         TReadGapsIter itGapsEnd = end(alignIt->gaps, Standard());
261 
262         // Place each read inside the matrix
263         myRead = fragStore.readSeqStore[alignIt->readId];
264         TSize lenRead = length(myRead);
265         TSize offset = alignIt->beginPos;
266         if (alignIt->beginPos > alignIt->endPos) {
267             reverseComplement(myRead);
268             offset = alignIt->endPos;
269         }
270         matIt = begin(mat, Standard());
271         matIt += (slot[readPos] * numCol + offset);
272 
273         typedef typename Iterator<typename TFragmentStore::TReadSeq, Standard>::Type TReadIter;
274         TReadIter seqReadIt = begin(myRead, Standard());
275 
276         // First clear range
277         TSize mySeqPos = 0;
278         int diff = 0;
279         if ((itGaps != itGapsEnd) && (itGaps->gapPos == 0)) {
280             mySeqPos = itGaps->seqPos;
281             diff = -1 * mySeqPos;
282             seqReadIt += mySeqPos;
283         }
284         TSize clr2 = lenRead;
285         TSize stop = 0;
286         for(;itGaps != itGapsEnd; ++itGaps) {
287             // Any clipped sequence at the end
288             stop =  itGaps->seqPos;
289             if (diff - ((int) itGaps->gapPos - (int) itGaps->seqPos) > 0)
290                 clr2 = stop = lenRead - (diff - ((int) itGaps->gapPos - (int) itGaps->seqPos));
291 
292             for(;mySeqPos < stop; ++matIt, ++seqReadIt, ++mySeqPos)
293                 *matIt = *seqReadIt;
294 
295             for(int i = 0; i < ((int) itGaps->gapPos - (int) itGaps->seqPos) - diff; ++i, ++matIt)
296                 *matIt = gapChar;
297 
298             diff = (itGaps->gapPos - itGaps->seqPos);
299         }
300         for(;mySeqPos < clr2; ++mySeqPos, ++seqReadIt, ++matIt)
301             *matIt = *seqReadIt;
302     }
303     //for(TSize row = 0; row < coverage; ++row) {
304     //    for(TSize col = 0; col<numCol; ++col) {
305     //        std::cout << mat[row * numCol + col];
306     //    }
307     //    std::cout << std::endl;
308     //}
309     return true;
310 }
311 
312 //////////////////////////////////////////////////////////////////////////////
313 
314 template<typename TSpec, typename TConfig, typename TMatrix, typename TSize2, typename TSize>
315 inline bool
convertAlignment(FragmentStore<TSpec,TConfig> & fragStore,TMatrix & mat,TSize2 contigId,TSize & coverage)316 convertAlignment(FragmentStore<TSpec, TConfig>& fragStore,
317                  TMatrix& mat,
318                  TSize2 contigId,
319                  TSize& coverage)
320 {
321     String<TSize> slot;
322     return convertAlignment(fragStore, mat, contigId, coverage, slot);
323 }
324 
325 //////////////////////////////////////////////////////////////////////////////
326 
327 template<typename TSpec, typename TConfig, typename TMatrix>
328 inline bool
convertAlignment(FragmentStore<TSpec,TConfig> & fragStore,TMatrix & mat)329 convertAlignment(FragmentStore<TSpec, TConfig>& fragStore,
330                  TMatrix& mat)
331 {
332     typedef FragmentStore<TSpec, TConfig> TFragmentStore;
333     typedef typename Size<TFragmentStore>::Type TSize;
334     TSize coverage;
335     return convertAlignment(fragStore, mat, 0, coverage);
336 }
337 
338 //////////////////////////////////////////////////////////////////////////////
339 
340 template<typename TSpec, typename TConfig, typename TGappedConsensus, typename TSize>
341 inline void
getGappedConsensus(FragmentStore<TSpec,TConfig> & fragStore,TGappedConsensus & gappedConsensus,TSize contigId)342 getGappedConsensus(FragmentStore<TSpec, TConfig>& fragStore,
343                    TGappedConsensus& gappedConsensus,
344                    TSize contigId)
345 {
346     typedef FragmentStore<TSpec, TConfig> TFragmentStore;
347     typedef typename Value<TGappedConsensus>::Type TValue;
348     typedef typename TFragmentStore::TContigPos TContigPos;
349 
350     TValue gapChar = gapValue<TValue>();
351     typedef typename Iterator<typename TFragmentStore::TContigSeq, Standard>::Type TContigIter;
352     TContigIter seqContigIt = begin(fragStore.contigStore[contigId].seq, Standard());
353     TContigIter seqContigItEnd = end(fragStore.contigStore[contigId].seq, Standard());
354     typedef typename Iterator<String<typename TFragmentStore::TContigGapAnchor>, Standard>::Type TGapsIter;
355     TGapsIter itGaps = begin(fragStore.contigStore[contigId].gaps, Standard());
356     TGapsIter itGapsEnd = end(fragStore.contigStore[contigId].gaps, Standard());
357     int diff = 0;
358     TContigPos mySeqPos = 0;
359     for(;itGaps != itGapsEnd; goNext(itGaps)) {
360         for(;mySeqPos < itGaps->seqPos; ++seqContigIt, ++mySeqPos)
361             appendValue(gappedConsensus, *seqContigIt, Generous());
362 
363         for(int i = 0; i < ((int) itGaps->gapPos - (int) itGaps->seqPos) - diff; ++i)
364             appendValue(gappedConsensus, gapChar, Generous());
365         //NOTE(h-2): the next line was previously indented although it belonged to the for loop
366         //           in case of unexpected behaviour, enclose these statements in {}
367         diff = (itGaps->gapPos - itGaps->seqPos);
368     }
369     for(;seqContigIt != seqContigItEnd; ++seqContigIt)
370         appendValue(gappedConsensus, *seqContigIt, Generous());
371 }
372 
373 
374 //////////////////////////////////////////////////////////////////////////////
375 
376 template<typename TSpec, typename TConfig, typename TGappedConsensus, typename TSize>
377 inline void
assignGappedConsensus(FragmentStore<TSpec,TConfig> & fragStore,TGappedConsensus & gappedCons,TSize contigId)378 assignGappedConsensus(FragmentStore<TSpec, TConfig>& fragStore,
379                       TGappedConsensus& gappedCons,
380                       TSize contigId)
381 {
382     typedef FragmentStore<TSpec, TConfig> TFragmentStore;
383     typedef typename Value<TGappedConsensus>::Type TValue;
384     TValue gapChar = gapValue<TValue>();
385 
386     // Update the contig
387     typedef typename Value<typename TFragmentStore::TContigStore>::Type TContigStoreElement;
388     TContigStoreElement& contigEl = fragStore.contigStore[contigId];
389     clear(contigEl.gaps);
390     clear(contigEl.seq);
391 
392     // Create the sequence and the gap anchors
393     typedef typename Iterator<TGappedConsensus, Standard>::Type TStringIter;
394     TStringIter seqIt = begin(gappedCons, Standard());
395     TStringIter seqItEnd = end(gappedCons, Standard());
396     typedef typename TFragmentStore::TReadPos TReadPos;
397     typedef typename TFragmentStore::TContigGapAnchor TContigGapAnchor;
398     TReadPos ungappedPos = 0;
399     TReadPos gappedPos = 0;
400     bool gapOpen = false;
401     for(;seqIt != seqItEnd; ++seqIt, ++gappedPos) {
402         if (*seqIt == gapChar) gapOpen = true;
403         else {
404             if (gapOpen) {
405                 appendValue(contigEl.gaps, TContigGapAnchor(ungappedPos, gappedPos), Generous());
406                 gapOpen = false;
407             }
408             Dna5Q letter = *seqIt;
409             assignQualityValue(letter, 'D');
410             appendValue(contigEl.seq, letter);
411             ++ungappedPos;
412         }
413     }
414     if (gapOpen)
415         appendValue(contigEl.gaps, TContigGapAnchor(ungappedPos, gappedPos), Generous());
416 }
417 
418 
419 
420 //////////////////////////////////////////////////////////////////////////////////
421 
422 /*
423  * @fn consensusAlignment
424  * @headerfile <seqan/consensus.h>
425  * @brief Compute consensus alignment.
426  *
427  * @signature void consensusAlignment(alignmentGraph, beginEndPos[, options])
428  *
429  * @param[out] alignmentGraph  Alignment graph to build.
430  * @param[in]  options         Optional settings for the consenus alignment, type: <tt>ConsensusOptions</tt>.
431  * @param[in]  beginEndPos     Interval start and end position for the read's alignment; <tt>String&lt;Pair&lt;TPos, TPos&gt; &gt;</tt>.
432  *
433  * @section Example
434  *
435  * @code{.cpp}
436  * #include <seqan/sequence.h>
437  * #include <seqan/graph_align.h>
438  * #include <seqan/consensus.h>
439  *
440  * int main()
441  * {
442  *     using namespace seqan;
443  *
444  *     typedef StringSet<Dna5String> TStringSet;
445  *     typedef Graph<Alignment<TStringSet, void, WithoutEdgeId> > TAlignGraph;
446  *
447  *     TStringSet readSet;
448  *     String<Pair<TSize> > begEndPos;
449  *
450  *     appendValue(readSet, "CCCAGTGA");
451  *     appendValue(begEndPos, Pair<TSize>(0, 5));
452  *     appendValue(readSet, "AGGGACTGT");
453  *     appendValue(begEndPos, Pair<TSize>(3, 9));
454  *
455  *     TAlignGraph alignmentGraph(readSet);
456  *     consensusAlignment(alignmentGraph, begEndPos);
457  *
458  *     return 0;
459  * }
460  * @endcode
461  */
462 
463 template<typename TStringSet, typename TCargo, typename TSpec, typename TSize, typename TConfigOptions>
464 inline void
consensusAlignment(Graph<Alignment<TStringSet,TCargo,TSpec>> & gOut,String<Pair<TSize,TSize>> & begEndPos,TConfigOptions const & consOpt)465 consensusAlignment(Graph<Alignment<TStringSet, TCargo, TSpec> >& gOut,
466                    String<Pair<TSize, TSize> >& begEndPos,
467                    TConfigOptions const& consOpt)
468 {
469     typedef Graph<Alignment<TStringSet, TCargo, TSpec> > TOutGraph;
470     typedef typename Id<TOutGraph>::Type TId;
471 
472     // Initialization
473     TStringSet& seqSet = stringSet(gOut);
474 
475     // Select all overlapping reads and record the diagonals of the band
476     String<Pair<TId, TId> > pList;
477     String<Pair<int, int> > diagList;
478     if (consOpt.window == 0)
479         selectPairsAssembly(seqSet, begEndPos, consOpt.bandwidth, pList, diagList);
480     else
481         selectPairsAllAgainstAll(seqSet, begEndPos, consOpt.window, pList, diagList);
482 
483     // Set-up a sparse distance matrix
484     Graph<Undirected<double> > pairGraph;
485 
486     // Containers for segment matches and corresponding scores
487     typedef String<Fragment<> > TFragmentString;
488     TFragmentString matches;
489     typedef String<int> TScoreValues;
490     TScoreValues scores;
491 
492     // Compute segment matches from global pairwise alignments
493     appendSegmentMatches(seqSet, pList, diagList, begEndPos, consOpt.sc, consOpt.matchlength, consOpt.quality, consOpt.overlaps, matches, scores, pairGraph, OverlapLibrary() );
494     clear(pList);
495     clear(diagList);
496 
497     // If there are no alignment matches, return
498     if (!length(matches)) return;
499 
500     // Use these segment matches for the initial alignment graph
501     typedef Graph<Alignment<TStringSet, TSize> > TGraph;
502     TGraph g(seqSet);
503     buildAlignmentGraph(matches, scores, g, consOpt.sc, ReScore() );
504     clear(matches);
505     clear(scores);
506 
507     // Guide Tree
508     Graph<Tree<double> > guideTree;
509     upgmaTree(pairGraph, guideTree);
510     clear(pairGraph);
511 
512     // Triplet library extension
513     graphBasedTripletLibraryExtension(g);
514 
515     // Perform a progressive alignment
516     progressiveAlignment(g, guideTree, gOut);
517     clear(g);
518     clear(guideTree);
519 }
520 
521 //////////////////////////////////////////////////////////////////////////////////
522 
523 
524 template<typename TStringSet, typename TCargo, typename TSpec, typename TSize>
525 inline void
consensusAlignment(Graph<Alignment<TStringSet,TCargo,TSpec>> & gOut,String<Pair<TSize,TSize>> & begEndPos)526 consensusAlignment(Graph<Alignment<TStringSet, TCargo, TSpec> >& gOut,
527                    String<Pair<TSize, TSize> >& begEndPos)
528 {
529     ConsensusOptions consOpt;
530     consensusAlignment(gOut, begEndPos, consOpt);
531 }
532 
533 
534 //////////////////////////////////////////////////////////////////////////////
535 
536 template <typename TFragSpec, typename TConfig, typename TStringSet, typename TCargo, typename TSpec, typename TContigId>
537 inline void
updateContig(FragmentStore<TFragSpec,TConfig> & fragStore,Graph<Alignment<TStringSet,TCargo,TSpec>> const & g,TContigId contigId)538 updateContig(FragmentStore<TFragSpec, TConfig>& fragStore,
539              Graph<Alignment<TStringSet, TCargo, TSpec> > const& g,
540              TContigId contigId)
541 {
542     typedef Graph<Alignment<TStringSet, TCargo, TSpec> > TGraph;
543     typedef typename Size<TGraph>::Type TSize;
544     typedef typename Value<TStringSet>::Type TString;
545     typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
546     typedef std::map<TSize, TSize> TComponentLength;
547     typedef char TValue;
548 
549     // Initialization
550     TStringSet& strSet = stringSet(g);
551     TSize nseq = length(strSet);
552     TValue gapChar = gapValue<TValue>();
553     TValue specialGap = '.';
554     TSize maxCoverage = 0;
555     TSize len = 0;
556     String<TValue> mat;
557 
558     // Store for each read the begin position, the end position and the row in the alignment matrix
559     String<TSize> readBegEndRowPos;
560     resize(readBegEndRowPos, 3*nseq);
561 
562     // Strongly Connected Components, topological sort, and length of each component
563     String<TSize> component;
564     String<TSize> order;
565     TComponentLength compLength;
566     if (convertAlignment(g, component, order, compLength)) {
567         TSize numOfComponents = length(order);
568 
569         // Assign to each sequence the start and end (in terms of component ranks)
570         typedef String<std::pair<TSize, TSize> > TComponentToRank;
571         TComponentToRank compToRank;
572         for(TSize compIndex = 0; compIndex < numOfComponents; ++compIndex)
573             appendValue(compToRank, std::make_pair(order[compIndex], compIndex), Generous());
574         std::sort(begin(compToRank, Standard()), end(compToRank, Standard()));
575 
576         typedef Pair<TSize, TSize> TRankPair;
577         typedef String<TRankPair> TSequenceToRanks;
578         TSequenceToRanks seqToRank;
579         resize(seqToRank, nseq);
580         typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
581         TVertexIterator itVertex(g);
582         for(;!atEnd(itVertex);++itVertex) {
583             TVertexDescriptor vert = value(itVertex);
584             TSize seq = idToPosition(strSet, sequenceId(g, vert));
585             if (fragmentBegin(g, vert) == 0)
586                 seqToRank[seq].i1 = std::lower_bound(begin(compToRank, Standard()), end(compToRank, Standard()), std::make_pair((TSize) component[vert], (TSize) 0))->second;
587             if (fragmentBegin(g, vert) + fragmentLength(g, vert) == length(strSet[seq]))
588                 seqToRank[seq].i2 = std::lower_bound(begin(compToRank, Standard()), end(compToRank, Standard()), std::make_pair((TSize) component[vert], (TSize) 0))->second;
589         }
590         clear(compToRank);
591 
592         // Assign the sequences to rows
593         String<TSize> seqToRow;
594         resize(seqToRow, nseq);
595         maxCoverage = 0;
596         typedef String<bool> TLeftOver;
597         typedef typename Iterator<TLeftOver, Standard>::Type TLeftOverIter;
598         TLeftOver leftOver;
599         resize(leftOver, nseq, true);
600         typedef String<std::pair<TSize, TSize> > TSeqToBegin;
601         typedef typename Iterator<TSeqToBegin, Standard>::Type TSeqToBeginIter;
602         TSeqToBegin seqToBegin;
603         TSize finishedSeq = 0;
604         while(finishedSeq < nseq) {
605             TLeftOverIter itL = begin(leftOver, Standard());
606             TLeftOverIter itLEnd = end(leftOver, Standard());
607             for(TSize pos = 0; itL != itLEnd; ++itL, ++pos)
608                 if (*itL) appendValue(seqToBegin, std::make_pair((seqToRank[pos]).i1, pos), Generous());
609             std::sort(begin(seqToBegin, Standard()), end(seqToBegin, Standard()));
610 
611             TSize endPos = 0;
612             TSeqToBeginIter itSB = begin(seqToBegin, Standard());
613             TSeqToBeginIter itSBEnd = end(seqToBegin, Standard());
614             for(;itSB != itSBEnd;++itSB) {
615                 if (endPos <= (*itSB).first) {
616                     TSize currentSeq = (*itSB).second;
617                     seqToRow[currentSeq] = maxCoverage;
618                     endPos = (seqToRank[currentSeq]).i2 + 2;
619                     leftOver[currentSeq] = false;
620                     ++finishedSeq;
621                 }
622             }
623             clear(seqToBegin);
624             ++maxCoverage;
625         }
626         clear(leftOver);
627 
628         // Create the matrix
629         len = 0;
630         String<TSize> compOffset;
631         resize(compOffset, numOfComponents);
632         for(TSize compIndex = 0; compIndex < numOfComponents; ++compIndex) {
633             compOffset[order[compIndex]] = len;
634             len+=compLength[order[compIndex]];
635         }
636         resize(mat, len * maxCoverage, gapChar);
637 
638         // Fill in the segments
639         typedef typename Infix<TString>::Type TInfix;
640         typedef typename Iterator<TInfix, Standard>::Type TInfixIter;
641         typedef typename TGraph::TPosToVertexMap_ TPosToVertexMap;
642         for(typename TPosToVertexMap::const_iterator it = g.data_pvMap.begin();it != g.data_pvMap.end(); ++it) {
643             TInfix str = label(g,it->second);
644             TSize c = property(component, it->second);
645             TSize row = seqToRow[idToPosition(strSet, it->first.first)];
646             //if (row == 0) {
647             //    std::cout << sequenceId(g, it->second) << ':' << str << ',' << strSet[sequenceId(g, it->second)] << std::endl;
648             //    std::cout << getProperty(component, it->second) << ',' << order[compIndex] << std::endl;
649             //    std::cout << (seqToRank[sequenceId(g, it->second)]).i1 << ',' << (seqToRank[sequenceId(g, it->second)]).i2 << std::endl;
650             //}
651             TInfixIter sIt = begin(str, Standard());
652             TInfixIter sItEnd = end(str, Standard());
653             TSize i = compOffset[c];
654             for(TSize pCol = i;sIt!=sItEnd;++sIt, ++pCol, ++i)
655                 mat[row * len + pCol] = *sIt;
656         }
657         String<bool> active;
658         for(TSize compIndex = 0; compIndex < numOfComponents; ++compIndex) {
659             TSize offset = compOffset[order[compIndex]];
660             TSize currentCompLength = compLength[order[compIndex]];
661 
662             clear(active);
663             resize(active, maxCoverage, false);
664 
665             // Find the empty rows
666             for(TSize i=0;i<nseq; ++i) {
667                 if (((seqToRank[i]).i1 <= compIndex) && ((seqToRank[i]).i2 >= compIndex))
668                     active[(seqToRow[i])] = true;
669             }
670 
671             // Substitute false gaps with special gap character
672             for(TSize i = 0; i < maxCoverage; ++i) {
673                 if (!(active[i])) {
674                     for(TSize pCol = offset;pCol < offset + currentCompLength;++pCol)
675                         mat[i * len + pCol] = specialGap;
676                 }
677             }
678         }
679 
680         // Get the new begin and end positions
681         for(TSize i=0;i<nseq; ++i) {
682             TVertexDescriptor lastVertex = findVertex(const_cast<TGraph&>(g), positionToId(strSet, i), length(strSet[i]) - 1);
683             TSize readBegin = compOffset[getProperty(component, findVertex(const_cast<TGraph&>(g), positionToId(strSet, i), 0))];
684             TSize readEnd = compOffset[getProperty(component, lastVertex)] + fragmentLength(const_cast<TGraph&>(g), lastVertex);
685             readBegEndRowPos[3*i] = readBegin;
686             readBegEndRowPos[3*i+1] = readEnd;
687             readBegEndRowPos[3*i+2] = seqToRow[i];
688         }
689 
690     }
691     clear(component);
692     clear(order);
693     compLength.clear();
694 
695 
696     //// Debug code
697     //for(TSize row = 0; row<maxCoverage; ++row) {
698     //    for(TSize col = 0; col<len; ++col) {
699     //        std::cout << mat[row * len + col];
700     //    }
701     //    std::cout << std::endl;
702     //}
703 
704     // Create the new consensus
705     typedef typename Value<TString>::Type TAlphabet;
706     String<TValue> gappedCons;
707     consensusCalling(mat, gappedCons, maxCoverage, TAlphabet(), MajorityVote());
708 
709     // Assign new consensus
710     assignGappedConsensus(fragStore, gappedCons, contigId);
711 
712     // Update all aligned reads
713     typedef FragmentStore<TSpec, TConfig> TFragmentStore;
714     typedef typename TFragmentStore::TReadPos TReadPos;
715     typedef typename TFragmentStore::TContigPos TContigPos;
716     typedef typename TFragmentStore::TContigGapAnchor TContigGapAnchor;
717     //typedef typename Value<typename TFragmentStore::TAlignedReadStore>::Type TAlignedElement;
718     typedef typename Iterator<typename TFragmentStore::TAlignedReadStore>::Type TAlignIter;
719     sortAlignedReads(fragStore.alignedReadStore, SortContigId());
720     TAlignIter alignIt = lowerBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId());
721     TAlignIter alignItEnd = upperBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId());
722     TReadPos ungappedPos = 0;
723     TReadPos gappedPos = 0;
724     bool gapOpen;
725     for(TSize i = 0;alignIt != alignItEnd; ++alignIt, ++i) {
726         TSize lenRead = length(fragStore.readSeqStore[alignIt->readId]);
727         TReadPos begClr = 0;
728         TReadPos endClr = 0;
729         getClrRange(fragStore, *alignIt, begClr, endClr);
730         clear(alignIt->gaps);
731         ungappedPos = begClr;
732         if (alignIt->beginPos > alignIt->endPos) ungappedPos = lenRead - endClr;
733         if (ungappedPos != 0) appendValue(alignIt->gaps, TContigGapAnchor(ungappedPos, 0));
734         gappedPos = 0;
735         gapOpen = false;
736         for(TSize column = readBegEndRowPos[3*i]; column<readBegEndRowPos[3*i + 1]; ++column, ++gappedPos) {
737             if (mat[readBegEndRowPos[3*i + 2] * len + column] == gapChar) gapOpen = true;
738             else {
739                 if (gapOpen) {
740                     appendValue(alignIt->gaps, TContigGapAnchor(ungappedPos, gappedPos), Generous());
741                     gapOpen = false;
742                 }
743                 ++ungappedPos;
744             }
745         }
746         if (gapOpen) appendValue(alignIt->gaps, TContigGapAnchor(ungappedPos, gappedPos), Generous());
747         if (alignIt->beginPos < alignIt->endPos) {
748             if (endClr != (TContigPos)lenRead)
749                 appendValue(alignIt->gaps, TContigGapAnchor(lenRead, lenRead + (gappedPos - ungappedPos) - (lenRead - endClr)), Generous());
750         } else {
751             if (begClr != 0)
752                 appendValue(alignIt->gaps, TContigGapAnchor(lenRead, lenRead + (gappedPos - ungappedPos) - begClr), Generous());
753         }
754 
755         // Set new begin and end position
756         if (alignIt->beginPos < alignIt->endPos) {
757             alignIt->beginPos = readBegEndRowPos[3*i];
758             alignIt->endPos = readBegEndRowPos[3*i+1];
759         } else {
760             alignIt->beginPos = readBegEndRowPos[3*i+1];
761             alignIt->endPos = readBegEndRowPos[3*i];
762         }
763     }
764 }
765 
766 
767 //////////////////////////////////////////////////////////////////////////////
768 
769 template <typename TValue, typename TSpec, typename TCounters, typename TSize, typename TAlphabet>
770 inline void
_countLetters(String<TValue,TSpec> const & mat,TCounters & counterValues,TSize alignDepth,TAlphabet)771 _countLetters(String<TValue, TSpec> const & mat,
772               TCounters & counterValues,
773               TSize alignDepth,
774               TAlphabet)
775 {
776     typedef String<TValue, TSpec> const TMatrix;
777     typedef typename Iterator<TMatrix, Standard>::Type TMatIter;
778 
779     // Initialization
780     TSize len = length(mat) / alignDepth;
781     TValue gapChar = gapValue<TValue>();
782     TValue specialGap = '.';
783     TSize alphabetSize = ValueSize<TAlphabet>::VALUE;
784 
785     // Set-up counter values
786     typedef typename Value<TCounters>::Type TCounter;
787     typedef typename Iterator<TCounters, Standard>::Type TCounterIt;
788     resize(counterValues, len);
789     for(TSize i=0;i<len; ++i) {
790         TCounter counter;
791         resize(counter, alphabetSize + 1, 0);
792         counterValues[i] = counter;
793     }
794 
795     // Count all
796     TMatIter matIt = begin(mat, Standard());
797     TMatIter matItEnd = end(mat, Standard());
798     TCounterIt countIt = begin(counterValues, Standard());
799     TSize pos = 0;
800     for(; matIt != matItEnd; ++matIt, ++countIt, ++pos) {
801         if (pos % len == 0) countIt = begin(counterValues, Standard());
802         if (*matIt != specialGap) {
803             if (*matIt == gapChar) ++value(*countIt, alphabetSize);
804             else ++value(*countIt, ordValue((TAlphabet) *matIt));
805         }
806     }
807 }
808 
809 
810 //////////////////////////////////////////////////////////////////////////////
811 
812 template <typename TValue, typename TSpec, typename TGappedCons, typename TAlignDepth, typename TAlphabet>
813 inline void
consensusCalling(String<TValue,TSpec> const & mat,TGappedCons & gappedConsensus,TAlignDepth maxCoverage,TAlphabet,Bayesian)814 consensusCalling(String<TValue, TSpec> const& mat,
815                  TGappedCons& gappedConsensus,
816                  TAlignDepth maxCoverage,
817                  TAlphabet,
818                  Bayesian)
819 {
820     typedef double TProbability;
821     typedef String<TProbability> TProbabilityDistribution;
822     typedef String<TProbabilityDistribution> TPositionalPrDist;
823     typedef typename Iterator<TPositionalPrDist, Standard>::Type TPosPrDistIter;
824 
825     typedef typename Size<String<TValue, TSpec> >::Type TSize;
826     TSize alphabetSize = ValueSize<TAlphabet>::VALUE;
827     TValue gapChar = gapValue<TValue>();
828     TValue specialGap = '.';
829 
830     // Set-up the counters
831     typedef String<TSize> TCounter;
832     typedef String<TCounter> TCounters;
833     TCounters counterValues;
834     _countLetters(mat, counterValues, maxCoverage, TAlphabet() );
835 
836 
837     // Initialization
838     TSize len = length(mat) / maxCoverage;
839     TProbabilityDistribution backroundDist;
840     resize(backroundDist, alphabetSize + 1, ((TProbability) 1 / (TProbability) (alphabetSize + 1)));
841 
842     // Get an initial consensus
843     typedef typename Iterator<TCounters, Standard>::Type TCounterIt;
844     TCounterIt countIt = begin(counterValues, Standard());
845     TCounterIt countItEnd = end(counterValues, Standard());
846     TPositionalPrDist posPrDist;
847     TValue c = TAlphabet();
848     for(;countIt != countItEnd; ++countIt) {
849         TSize max = 0;
850         typedef typename Iterator<TCounter, Standard>::Type TCIt;
851         TCIt cIt = begin(*countIt, Standard());
852         TCIt cItEnd = end(*countIt, Standard());
853         TSize pos = 0;
854         for(;cIt != cItEnd; ++cIt, ++pos) {
855             if (*cIt > max) {
856                 max = *cIt;
857                 c = (pos == alphabetSize) ? gapChar : (TValue) TAlphabet(pos);
858             }
859         }
860         TProbabilityDistribution prDist;
861         resize(prDist, alphabetSize + 1, 0);
862         if (c == gapChar) prDist[alphabetSize] = 1;
863         else prDist[ordValue((TAlphabet) c)] = 1;
864         appendValue(posPrDist, prDist, Generous());
865     }
866 
867     TSize run = 1;
868     TProbabilityDistribution pI;
869     TProbabilityDistribution pIJ;
870     TProbabilityDistribution pIOld;
871     TProbabilityDistribution pIJOld;
872     while (run) {
873         // Store the values from the last iteration
874         pIOld = pI;
875         pIJOld = pIJ;
876 
877         // Count all letters in the consensus
878         TProbabilityDistribution nI;
879         resize(nI, alphabetSize + 1, 0);
880         TPosPrDistIter itPosPrDist = begin(posPrDist, Standard());
881         TPosPrDistIter itPosPrDistEnd = end(posPrDist, Standard());
882         for(;itPosPrDist!=itPosPrDistEnd; ++itPosPrDist)
883             for(TSize i = 0; i<(alphabetSize + 1); ++i)
884                 nI[i] += (*itPosPrDist)[i];
885 
886         // Composition probabilities
887         clear(pI);
888         resize(pI, alphabetSize + 1);
889         TProbability lenPosPrDist = (TProbability) length(posPrDist);
890         for(TSize i = 0; i<length(pI); ++i)
891             pI[i] = nI[i] / lenPosPrDist;
892 
893 
894         // Count all letters that agree / disagree with the consensus
895         TProbabilityDistribution nIJ;
896         resize(nIJ, (alphabetSize + 1) * (alphabetSize + 1), 0);
897         typedef String<TValue, TSpec> TMatrix;
898         typedef typename Iterator<TMatrix, Standard>::Type TMatIter;
899         TMatIter matIt = begin(mat, Standard());
900         TMatIter matItEnd = end(mat, Standard());
901         itPosPrDist = begin(posPrDist, Standard());
902         TSize pos = 0;
903         for(; matIt != matItEnd; ++matIt, ++itPosPrDist, ++pos) {
904             if (pos % len == 0) itPosPrDist = begin(posPrDist, Standard());
905             TValue c = *matIt;
906             if (c != specialGap) {
907                 TSize fragJ = (c != gapChar) ? ordValue(TAlphabet(c)) : alphabetSize;
908                 for(TSize consI = 0; consI<(alphabetSize + 1); ++consI)
909                     nIJ[consI * (alphabetSize + 1) + fragJ] += (*itPosPrDist)[consI];
910             }
911         }
912 
913         // Sequencing error probabilities
914         clear(pIJ);
915         resize(pIJ, (alphabetSize + 1) * (alphabetSize + 1));
916         TProbability sumIJ = 0;
917         for(TSize diag = 0; diag<(alphabetSize + 1); ++diag) sumIJ += nIJ[diag * (alphabetSize + 1) + diag];
918         for(TSize consI = 0; consI<(alphabetSize + 1); ++consI)
919             for(TSize fragJ = 0; fragJ<(alphabetSize + 1); ++fragJ)
920                 pIJ[consI * (alphabetSize + 1) + fragJ] = nIJ[consI * (alphabetSize + 1) + fragJ] / sumIJ;
921 
922         // Recompute positional probability distribution
923         itPosPrDist = begin(posPrDist, Standard());
924         TSize col = 0;
925         for(;itPosPrDist!=itPosPrDistEnd; ++itPosPrDist, ++col) {
926             TProbabilityDistribution prDist;
927             resize(prDist, alphabetSize + 1);
928             for(TSize consI = 0; consI<(alphabetSize + 1); ++consI) {
929                 TProbability numerator = pI[consI];
930                 TProbability denominator = 0;
931                 for(TSize allI = 0; allI<(alphabetSize + 1); ++allI) {
932                     TProbability denominatorSub = value(pI, allI);
933                     for(TSize row = 0; row < maxCoverage; ++row) {
934                         TValue c = mat[row * len + col];
935                         if (c != specialGap) {
936                             TSize fragJ = (c != gapChar) ? ordValue(TAlphabet(c)) : alphabetSize;
937                             if (allI == consI)
938                                 numerator *= pIJ[allI * (alphabetSize + 1) + fragJ];
939                             denominatorSub *= pIJ[allI * (alphabetSize + 1) + fragJ];
940                         }
941                     }
942                     denominator += denominatorSub;
943                 }
944                 prDist[consI] = numerator / denominator;
945             }
946             *itPosPrDist = prDist;
947         }
948 
949         // Check termination criterion
950         TProbability eps = 0.00001;
951         typedef typename Iterator<TProbabilityDistribution, Standard>::Type TProbIter;
952         TProbIter pIter = begin(pIOld, Standard());
953         TProbIter pIterCompare = begin(pI, Standard());
954         TProbIter pIterEnd = end(pIOld, Standard());
955         TSize runOld = run;
956         for(;pIter != pIterEnd; ++pIter, ++pIterCompare) {
957             if (*pIter > *pIterCompare) {
958                 if (*pIter - *pIterCompare > eps) {
959                     ++run;
960                     break;
961                 }
962             } else {
963                 if (*pIterCompare - *pIter > eps) {
964                     ++run;
965                     break;
966                 }
967             }
968         }
969         if (runOld == run) {
970             pIter = begin(pIJOld, Standard());
971             pIterCompare = begin(pIJ, Standard());
972             pIterEnd = end(pIJOld, Standard());
973             for(;pIter != pIterEnd; ++pIter, ++pIterCompare) {
974                 if (*pIter > *pIterCompare) {
975                     if (*pIter - *pIterCompare > eps) {
976                         ++run;
977                         break;
978                     }
979                 } else {
980                     if (*pIterCompare - *pIter > eps) {
981                         ++run;
982                         break;
983                     }
984                 }
985             }
986         }
987 
988         if (runOld == run) {
989             std::cout << "Iterations: " << run << std::endl;
990             run = 0;
991         }
992     }
993 
994     // Compute the most likely consensus
995     TPosPrDistIter itPosPrDist = begin(posPrDist, Standard());
996     TPosPrDistIter itPosPrDistEnd = end(posPrDist, Standard());
997     clear(gappedConsensus);
998     for(;itPosPrDist!=itPosPrDistEnd; ++itPosPrDist) {
999         TProbability max = 0;
1000         TSize ind = 0;
1001         for(TSize consI = 0; consI<(alphabetSize + 1); ++consI) {
1002             if ((*itPosPrDist)[consI] > max) {
1003                 max = (*itPosPrDist)[consI];
1004                 ind = consI;
1005             }
1006         }
1007         if (ind == alphabetSize) appendValue(gappedConsensus, gapChar);
1008         else appendValue(gappedConsensus, TAlphabet(ind));
1009     }
1010 
1011 }
1012 
1013 //////////////////////////////////////////////////////////////////////////////
1014 
1015 
1016 template <typename TFragSpec, typename TConfig, typename TContigId>
1017 inline void
consensusCalling(FragmentStore<TFragSpec,TConfig> & fragStore,TContigId contigId,Bayesian)1018 consensusCalling(FragmentStore<TFragSpec, TConfig>& fragStore,
1019                  TContigId contigId,
1020                  Bayesian)
1021 {
1022 
1023     typedef FragmentStore<TFragSpec, TConfig> TFragmentStore;
1024     typedef typename Size<TFragmentStore>::Type TSize;
1025     typedef typename TFragmentStore::TReadSeq TReadSeq;
1026     typedef typename Value<TReadSeq>::Type TAlphabet;
1027     typedef char TValue;
1028 
1029     // Convert the contig to an alignment matrix
1030     typedef String<TValue> TAlignMat;
1031     TAlignMat mat;
1032     TSize maxCoverage;
1033     convertAlignment(fragStore, mat, contigId, maxCoverage);
1034 
1035     // Call the consensus
1036     String<TValue> gappedConsensus;
1037     consensusCalling(mat, gappedConsensus, maxCoverage, TAlphabet(), Bayesian());
1038 
1039     // Assign the new consensus
1040     assignGappedConsensus(fragStore, gappedConsensus, contigId);
1041 }
1042 
1043 //////////////////////////////////////////////////////////////////////////////
1044 
1045 template <typename TValue, typename TSpec, typename TGappedCons, typename TAlignDepth, typename TAlphabet>
1046 inline void
consensusCalling(String<TValue,TSpec> const & mat,TGappedCons & gappedConsensus,TAlignDepth maxCoverage,TAlphabet,MajorityVote)1047 consensusCalling(String<TValue, TSpec> const& mat,
1048                  TGappedCons& gappedConsensus,
1049                  TAlignDepth maxCoverage,
1050                  TAlphabet,
1051                  MajorityVote)
1052 {
1053     typedef typename Size<String<TValue, TSpec> >::Type TSize;
1054     TSize alphabetSize = ValueSize<TAlphabet>::VALUE;
1055     TValue gapChar = gapValue<TValue>();
1056 
1057     // Set-up the counters
1058     typedef String<TSize> TCounter;
1059     typedef String<TCounter> TCounters;
1060     TCounters counterValues;
1061     _countLetters(mat, counterValues, maxCoverage, TAlphabet() );
1062 
1063     // Get the consensus
1064     typedef typename Iterator<TCounters, Standard>::Type TCounterIt;
1065     TCounterIt countIt = begin(counterValues, Standard());
1066     TCounterIt countItEnd = end(counterValues, Standard());
1067     clear(gappedConsensus);
1068     TSize max = 0;
1069     TValue c = TValue();
1070     TSize pos = 0;
1071     for(;countIt != countItEnd; ++countIt) {
1072         max = 0;
1073         typedef typename Iterator<TCounter, Standard>::Type TCIt;
1074         TCIt cIt = begin(*countIt, Standard());
1075         TCIt cItEnd = end(*countIt, Standard());
1076         pos = 0;
1077         for(;cIt != cItEnd; ++cIt, ++pos) {
1078             if (*cIt > max) {
1079                 max = *cIt;
1080                 c = (pos == alphabetSize) ? gapChar : (TValue) TAlphabet(pos);
1081             }
1082         }
1083         appendValue(gappedConsensus, c);
1084     }
1085 }
1086 
1087 
1088 //////////////////////////////////////////////////////////////////////////////
1089 
1090 
1091 template <typename TFragSpec, typename TConfig, typename TContigId>
1092 inline void
consensusCalling(FragmentStore<TFragSpec,TConfig> & fragStore,TContigId contigId,MajorityVote)1093 consensusCalling(FragmentStore<TFragSpec, TConfig>& fragStore,
1094                  TContigId contigId,
1095                  MajorityVote)
1096 {
1097     typedef FragmentStore<TFragSpec, TConfig> TFragmentStore;
1098     typedef typename Size<TFragmentStore>::Type TSize;
1099     typedef typename TFragmentStore::TReadSeq TReadSeq;
1100     typedef typename Value<TReadSeq>::Type TAlphabet;
1101     typedef char TValue;
1102 
1103     // Convert the contig to an alignment matrix
1104     typedef String<TValue> TAlignMat;
1105     TAlignMat mat;
1106     TSize maxCoverage;
1107     convertAlignment(fragStore, mat, contigId, maxCoverage);
1108 
1109     // Call the consensus
1110     String<TValue> gappedConsensus;
1111     consensusCalling(mat, gappedConsensus, maxCoverage, TAlphabet(), MajorityVote());
1112 
1113     // Assign the new consensus
1114     assignGappedConsensus(fragStore, gappedConsensus, contigId);
1115 }
1116 
1117 
1118 //////////////////////////////////////////////////////////////////////////////
1119 
1120 
1121 //////////////////////////////////////////////////////////////////////////////
1122 // Old proprietary FastaReadFormat
1123 //////////////////////////////////////////////////////////////////////////////
1124 
1125 //////////////////////////////////////////////////////////////////////////////
1126 
1127 struct FastaReadFormat_;
1128 typedef Tag<FastaReadFormat_> const FastaReadFormat;
1129 
1130 //////////////////////////////////////////////////////////////////////////////
1131 
1132 template<typename TFile, typename TSpec, typename TConfig>
1133 inline void
write(TFile & file,FragmentStore<TSpec,TConfig> & fragStore,FastaReadFormat)1134 write(
1135     TFile & file,
1136     FragmentStore<TSpec, TConfig>& fragStore,
1137     FastaReadFormat)
1138 {
1139 //IOREV _nodoc_ what has this got to do with consensus? why is it here?
1140     // Basic types
1141     typedef FragmentStore<TSpec, TConfig> TFragmentStore;
1142     typedef typename Size<TFragmentStore>::Type TSize;
1143     typedef typename TFragmentStore::TContigPos TContigPos;
1144     //typedef typename Value<TFile>::Type TValue;
1145     typedef char TMultiReadChar;
1146     TMultiReadChar gapChar = gapValue<TMultiReadChar>();
1147 
1148     typename DirectionIterator<TFile, Output>::Type target = directionIterator(file, Output());
1149 
1150     typedef typename Iterator<typename TFragmentStore::TContigStore, Standard>::Type TContigIter;
1151     TContigIter contigIt = begin(fragStore.contigStore, Standard() );
1152     TContigIter contigItEnd = end(fragStore.contigStore, Standard() );
1153     for(TSize idCount = 0;contigIt != contigItEnd; ++contigIt, ++idCount) {
1154         // Alignment matrix
1155         typedef String<TMultiReadChar> TAlignMat;
1156         TAlignMat mat;
1157         TSize maxCoverage;
1158         String<TSize> readSlot;
1159         convertAlignment(fragStore, mat, idCount, maxCoverage, readSlot);
1160         TSize len = length(mat) / maxCoverage;
1161 
1162         // Gapped consensus sequence
1163         typedef String<TMultiReadChar> TGappedConsensus;
1164         TGappedConsensus gappedConsensus;
1165         getGappedConsensus(fragStore, gappedConsensus, idCount);
1166 
1167         // Print the alignment matrix
1168         String<TSize> coverage;
1169         resize(coverage, len, 0);
1170         typedef typename Iterator<TGappedConsensus, Standard>::Type TConsIter;
1171         TConsIter itCons = begin(gappedConsensus, Standard());
1172         TSize winSize = 60;
1173         int offset = 2;
1174         TSize column = 0;
1175         while (column<len) {
1176             TSize window_end = column + winSize;
1177             if (window_end >= len) window_end = len;
1178             // Position
1179             for(int i = 0; i<offset - 2; ++i) writeValue(target, ' ');
1180             write(target, "Pos: ");
1181             appendNumber(target, column);
1182             writeValue(target, '\n');
1183             // Ruler
1184             for(int i = 0; i<offset + 3; ++i) writeValue(target, ' ');
1185             for(TSize local_col = 1; local_col<window_end - column + 1; ++local_col) {
1186                 if ((local_col % 10)==0)
1187                     writeValue(target, ':');
1188                 else if ((local_col % 5)==0)
1189                     writeValue(target, '.');
1190                 else
1191                     writeValue(target, ' ');
1192             }
1193             writeValue(target, '\n');
1194             // Matrix
1195             for(TSize row = 0; row<maxCoverage; ++row) {
1196                 TSize tmp = row;
1197                 int off = 0;
1198                 while (tmp / 10 != 0) {
1199                     tmp /= 10;
1200                     ++off;
1201                 }
1202                 for(int i = 0; i<offset - off; ++i) writeValue(target, ' ');
1203                 appendNumber(target, row);
1204                 writeValue(target, ':');
1205                 writeValue(target, ' ');
1206                 for(TSize local_col = column; local_col<window_end; ++local_col) {
1207                     writeValue(target, mat[row * len + local_col]);
1208                     if (mat[row * len + local_col] != '.') ++coverage[local_col];
1209                 }
1210                 writeValue(target, '\n');
1211             }
1212             writeValue(target, '\n');
1213 
1214             // Consensus
1215             for(int i = 0; i<offset; ++i)
1216                 writeValue(target, ' ');
1217             write(target, "C: ");
1218             for(unsigned int local_col = column; local_col<window_end; ++local_col, ++itCons)
1219                 writeValue(target, *itCons);
1220             writeValue(target, '\n');
1221             for(int i = 0; i<offset-1; ++i)
1222                 writeValue(target, ' ');
1223             write(target, ">2: ");
1224             for(unsigned int local_col = column; local_col<window_end; ++local_col) {
1225                 if (coverage[local_col] > 2)
1226                     writeValue(target, gappedConsensus[local_col]);
1227                 else
1228                     writeValue(target, gapChar);
1229             }
1230             writeValue(target, '\n');
1231             writeValue(target, '\n');
1232             column+=winSize;
1233         }
1234         writeValue(target, '\n');
1235         writeValue(target, '\n');
1236 
1237         // Print all aligned reads belonging to this contig
1238 
1239         // Sort according to contigId
1240         sortAlignedReads(fragStore.alignedReadStore, SortContigId());
1241 
1242         // Find range of the given contig
1243         typedef typename Iterator<typename TFragmentStore::TAlignedReadStore, Standard>::Type TAlignIter;
1244         TAlignIter alignIt = lowerBoundAlignedReads(fragStore.alignedReadStore, idCount, SortContigId());
1245         TAlignIter alignItEnd = upperBoundAlignedReads(fragStore.alignedReadStore, idCount, SortContigId());
1246 
1247         // Sort the reads according to the begin position
1248         sortAlignedReads(infix(fragStore.alignedReadStore, alignIt - begin(fragStore.alignedReadStore, Standard()), alignItEnd - begin(fragStore.alignedReadStore, Standard())), SortBeginPos());
1249         TAlignIter alignItTmp = lowerBoundAlignedReads(fragStore.alignedReadStore, idCount, SortContigId());
1250         TAlignIter alignItTmpEnd = upperBoundAlignedReads(fragStore.alignedReadStore, idCount, SortContigId());
1251         String<std::pair<TSize, TSize> > idToPos;
1252         reserve(idToPos, alignItTmpEnd - alignItTmp);
1253         for(TSize iCount = 0; alignItTmp!=alignItTmpEnd; ++iCount, ++alignItTmp)
1254             appendValue(idToPos, std::make_pair(alignItTmp->id, readSlot[iCount]));
1255         std::sort(begin(idToPos, Standard()), end(idToPos, Standard()));
1256 
1257         // Sort the reads according to the id
1258         sortAlignedReads(infix(fragStore.alignedReadStore, alignIt - begin(fragStore.alignedReadStore, Standard()), alignItEnd - begin(fragStore.alignedReadStore, Standard())), SortId());
1259         alignIt = lowerBoundAlignedReads(fragStore.alignedReadStore, idCount, SortContigId());
1260         alignItEnd = upperBoundAlignedReads(fragStore.alignedReadStore, idCount, SortContigId());
1261 
1262         bool noNamesPresent = (length(fragStore.readNameStore) == 0);
1263         for(TSize iCount = 0;alignIt != alignItEnd; ++alignIt, ++iCount) {
1264 
1265             // Print all reads
1266             write(target, "typ:");
1267             if (!noNamesPresent)
1268             {
1269                 writeValue(target, 'R');
1270                 appendNumber(target, iCount);
1271             }
1272             else
1273             {
1274                 write(target, fragStore.readNameStore[alignIt->readId]);
1275             }
1276             write(target, "\nseq:");
1277             write(target, fragStore.readSeqStore[alignIt->readId]);
1278             write(target, "\nPos:");
1279             appendNumber(target, alignIt->beginPos);
1280             writeValue(target, ',');
1281             appendNumber(target, alignIt->endPos);
1282             writeValue(target, '\n');
1283 #ifndef CELERA_OFFSET
1284             TSize begClr = 0;
1285             TSize endClr = 0;
1286             getClrRange(fragStore, *alignIt, begClr, endClr);
1287             write(target, "clr:");
1288             appendNumber(target, begClr);
1289             writeValue(target, ',');
1290             appendNumber(target, endClr);
1291             writeValue(target, '\n');
1292 #endif
1293             std::stringstream gapCoords;
1294             TSize letterCount = 0;
1295             TSize gapCount = 0;
1296             for(TContigPos column = _min(alignIt->beginPos, alignIt->endPos); column < _max(alignIt->beginPos, alignIt->endPos); ++column) {
1297                 if (mat[idToPos[iCount].second * len + column] == gapChar) {
1298                     ++gapCount;
1299                     gapCoords << letterCount << ' ';
1300                 } else ++letterCount;
1301             }
1302             write(target, "dln:");
1303             appendNumber(target, gapCount);
1304             writeValue(target, '\n');
1305             write(target, "del:");
1306             write(target, gapCoords.str());
1307             writeValue(target, '\n');
1308             writeValue(target, '\n');
1309         }
1310     }
1311 }
1312 
1313 //////////////////////////////////////////////////////////////////////////////
1314 // Read simulator format: Simple fasta read file with positions
1315 //////////////////////////////////////////////////////////////////////////////
1316 
1317 
1318 //////////////////////////////////////////////////////////////////////////////////
1319 
1320 
1321 template<typename TFile, typename TSpec, typename TConfig, typename TFilePath>
1322 inline void
_convertSimpleReadFile(TFile &,FragmentStore<TSpec,TConfig> &,TFilePath &,bool)1323 _convertSimpleReadFile(TFile& /*file*/,
1324                        FragmentStore<TSpec, TConfig>& /*fragStore*/,
1325                        TFilePath& /*filePath*/,
1326                        bool /*moveToFront*/)
1327 {
1328 return;
1329 ////IOREV _nodoc_ huge undocumented function, uses custom IO based on iostream and FILE* :S
1330 //    // Basic types
1331 //    typedef FragmentStore<TSpec, TConfig> TFragmentStore;
1332 //    typedef typename Id<TFragmentStore>::Type TId;
1333 //    typedef typename Size<TFragmentStore>::Type TSize;
1334 //    //typedef typename Value<TFile>::Type TValue;
1335 //    typedef typename TFragmentStore::TContigPos TPos;
1336 //    typedef typename TFragmentStore::TReadSeq TReadSeq;
1337 //
1338 //    // All fragment store element types
1339 //    typedef typename Value<typename TFragmentStore::TContigStore>::Type TContigElement;
1340 //    typedef typename Value<typename TFragmentStore::TLibraryStore>::Type TLibraryStoreElement;
1341 //    typedef typename Value<typename TFragmentStore::TMatePairStore>::Type TMatePairElement;
1342 //    typedef typename Value<typename TFragmentStore::TReadStore>::Type TReadStoreElement;
1343 //    typedef typename Value<typename TFragmentStore::TAlignedReadStore>::Type TAlignedElement;
1344 //
1345 //
1346 //    // All maps to mirror file ids to our internal ids
1347 //    typedef std::map<TId, TId> TIdMap;
1348 //    TIdMap libIdMap;
1349 //    TIdMap frgIdMap;
1350 //    TIdMap readIdMap;
1351 //
1352 //    // Create RecordReader object.
1353 //    typename DirectionIterator<TFile, Input>::Type reader = directionIterator(file, Input());
1354 //
1355 //    // Parse the file and convert the internal ids
1356 //    TPos maxPos = 0;
1357 //    TPos minPos = std::numeric_limits<TPos>::max();
1358 //    TId count = 0;
1359 //    if (atEnd(reader))
1360 //        return false;
1361 //    CharString buffer;
1362 //    while (!atEnd(reader))
1363 //    {
1364 //        // New read?
1365 //        if (value(reader) == '>')
1366 //        {
1367 //            TAlignedElement alignEl;
1368 //            TId id = count;
1369 //            TId fragId = TReadStoreElement::INVALID_ID;
1370 //            TId repeatId = 0;
1371 //
1372 //            goNext(reader);
1373 //            if (skipWhitespaces(reader) != 0)
1374 //                return 1;
1375 //
1376 //            // Get the layout positions
1377 //            clear(buffer);
1378 //            if (readDigits(buffer, reader) != 0)
1379 //                return 1;
1380 //            if (!lexicalCast2(alignEl.beginPos, buffer))
1381 //                return 1;
1382 //            goNext(reader);
1383 //            if (skipWhitespaces(reader) != 0)
1384 //                return 1;
1385 //            clear(buffer);
1386 //            if (readDigits(buffer, reader) != 0)
1387 //                return 1;
1388 //            if (!lexicalCast2(alignEl.endPos, buffer))
1389 //                return 1;
1390 //
1391 //            // Any attributes?
1392 //            String<char> eid;
1393 //            String<char> qlt;
1394 //            TReadSeq seq;
1395 //            if (value(reader) == '[')
1396 //            {
1397 //                String<char> fdIdentifier;
1398 //                while (value(reader) != ']')
1399 //                {
1400 //                    goNext(reader);
1401 //                    if (skipWhitespaces(reader) != 0)
1402 //                        return 1;
1403 //                    clear(fdIdentifier);
1404 //                    if (readAlphaNums(fdIdentifier, reader) != 0)
1405 //                        return 1;
1406 //                    goNext(reader);  // Skip "="
1407 //                    if (fdIdentifier == "id")
1408 //                    {
1409 //                        clear(buffer);
1410 //                        if (readDigits(buffer, reader) != 0)
1411 //                            return 1;
1412 //                        if (!lexicalCast2(id, buffer))
1413 //                            return 1;
1414 //                    } else if (fdIdentifier == "fragId") {
1415 //                        clear(buffer);
1416 //                        if (readDigits(buffer, reader) != 0)
1417 //                            return 1;
1418 //                        if (!lexicalCast2(fragId, buffer))
1419 //                            return 1;
1420 //                    } else if (fdIdentifier == "repeatId") {
1421 //                        clear(buffer);
1422 //                        if (readDigits(buffer, reader) != 0)
1423 //                            return 1;
1424 //                        if (!lexicalCast2(repeatId, buffer))
1425 //                            return 1;
1426 //                    } else if (fdIdentifier == "eid") {
1427 //                        if (readUntilOneOf(eid, reader, ',', ']') != 0)
1428 //                            return 1;
1429 //                    } else if (fdIdentifier == "qlt") {
1430 //                        if (readUntilOneOf(qlt, reader, ',', ']') != 0)
1431 //                            return 1;
1432 //                    } else {
1433 //                        // Jump to next attribute
1434 //                        // TODO(holtgrew): Add skipUntilOneOf()?
1435 //                        if (readUntilOneOf(buffer, reader, ',', ']') != 0)
1436 //                            return 1;
1437 //                    }
1438 //                }
1439 //            }
1440 //            if (skipLine(reader) != 0)
1441 //                return 1;
1442 //            if (skipWhitespaces(reader) != 0)
1443 //                return 1;
1444 //            while (!atEnd(reader) && value(reader) != '>')
1445 //            {
1446 //                if (readLetters(seq, reader) != 0)
1447 //                    return 1;
1448 //                int res = skipWhitespaces(reader);
1449 //                if (res != 0 && res != EOF_BEFORE_SUCCESS)
1450 //                    return 1;
1451 //            }
1452 //
1453 //            // Set quality
1454 //            typedef typename Iterator<TReadSeq, Standard>::Type TReadIter;
1455 //            typedef typename Iterator<String<char> >::Type TQualIter;
1456 //            TReadIter begIt = begin(seq, Standard() );
1457 //            TReadIter begItEnd = begin(seq, Standard() );
1458 //            if (length(qlt)) {
1459 //                TQualIter qualIt = begin(qlt);
1460 //                TQualIter qualItEnd = end(qlt);
1461 //                for(;qualIt != qualItEnd; goNext(qualIt), goNext(begIt)) assignQualityValue(value(begIt), value(qualIt));
1462 //            } else {
1463 //                for(;begIt != begItEnd; goNext(begIt)) assignQualityValue(value(begIt), 'D');
1464 //            }
1465 //
1466 //            // Set eid if not given
1467 //            if (empty(eid)) {
1468 //                std::stringstream input;
1469 //                input << "R" << id;
1470 //                input << "-" << repeatId;
1471 //                eid = input.str().c_str();
1472 //            }
1473 //
1474 //            // Insert the read
1475 //            readIdMap.insert(std::make_pair(id, static_cast<TId>(length(fragStore.readStore))));
1476 //            appendRead(fragStore, seq, fragId);
1477 //            appendValue(fragStore.readNameStore, eid, Generous());
1478 //
1479 //            // Insert an aligned read
1480 //            TSize readLen = length(seq);
1481 //            if (alignEl.beginPos < alignEl.endPos) {
1482 //                if ((TPos)readLen != alignEl.endPos - alignEl.beginPos) {
1483 //                    alignEl.endPos = alignEl.beginPos + readLen;
1484 //                }
1485 //                if (alignEl.beginPos < minPos) minPos = alignEl.beginPos;
1486 //                if (alignEl.endPos > maxPos) maxPos = alignEl.endPos;
1487 //            } else {
1488 //                if ((TPos)readLen != alignEl.beginPos - alignEl.endPos) {
1489 //                    alignEl.beginPos = alignEl.endPos + readLen;
1490 //                }
1491 //                if (alignEl.endPos < minPos) minPos = alignEl.endPos;
1492 //                if (alignEl.beginPos > maxPos) maxPos = alignEl.beginPos;
1493 //            }
1494 //            alignEl.readId = id;
1495 //            alignEl.pairMatchId =  fragId;
1496 //            alignEl.contigId = 0;
1497 //            alignEl.id = length(fragStore.alignedReadStore);
1498 //            appendValue(fragStore.alignedReadStore, alignEl, Generous());
1499 //            ++count;
1500 //        } else {
1501 //            if (skipLine(reader) != 0)
1502 //                return 1;
1503 //        }
1504 //    }
1505 //
1506 //    // Read contig or reference sequence
1507 //    TContigElement contigEl;
1508 //    std::string fileName = filePath + 'S';
1509 //    SeqFileIn strmRef;
1510 //    String<char> contigEid = "C0";
1511 //    if (open(strmRef, fileName.c_str()))
1512 //    {
1513 //        typename DirectionIterator<TFile, Input>::Type readerRef = directionIterator(strmRef, Input());
1514 //        clear(contigEid);
1515 //        readRecord(contigEid, contigEl.seq, strmRef);
1516 //        close(strmRef);
1517 //    }
1518 //    if (empty(contigEl.seq)) {
1519 //        typedef typename TFragmentStore::TContigGapAnchor TContigGapAnchor;
1520 //        if (moveToFront) appendValue(contigEl.gaps, TContigGapAnchor(0, maxPos - minPos), Generous());
1521 //        else appendValue(contigEl.gaps, TContigGapAnchor(0, maxPos), Generous());
1522 //    }
1523 //    appendValue(fragStore.contigStore, contigEl, Generous());
1524 //    appendValue(fragStore.contigNameStore, contigEid, Generous());
1525 //
1526 //
1527 //    // Read fragments
1528 //    fileName = filePath + 'F';
1529 //    FILE* strmFrag = fopen(fileName.c_str(), "rb");
1530 //    if (!strmFrag)
1531 //        return 1;
1532 //    RecordReader<FILE *, SinglePass<> > readerFrag(strmFrag);
1533 //    while (!atEnd(readerFrag))
1534 //    {
1535 //        if (value(readerFrag) == '>')
1536 //        {
1537 //            TMatePairElement matePairEl;
1538 //            goNext(readerFrag);
1539 //            if (skipWhitespaces(readerFrag) != 0)
1540 //                return 1;
1541 //
1542 //            // Get the fragment id
1543 //            clear(buffer);
1544 //            if (readDigits(buffer, readerFrag) != 0)
1545 //                return 1;
1546 //            TId id = 0;
1547 //            if (!lexicalCast2(id, buffer))
1548 //                return 1;
1549 //
1550 //            // Any attributes?
1551 //            std::stringstream input;
1552 //            input << "F" << id;
1553 //            String<char> eid(input.str().c_str());
1554 //            if (value(readerFrag) == '[')
1555 //            {
1556 //                String<char> fdIdentifier;
1557 //                while (value(readerFrag) != ']')
1558 //                {
1559 //                    goNext(readerFrag);
1560 //                    if (skipWhitespaces(readerFrag) != 0)
1561 //                        return 1;
1562 //                    clear(fdIdentifier);
1563 //                    if (readAlphaNums(fdIdentifier, readerFrag) != 0)
1564 //                        return 1;
1565 //                    goNext(readerFrag);
1566 //                    if (fdIdentifier == "libId")
1567 //                    {
1568 //                        clear(buffer);
1569 //                        if (readDigits(buffer, readerFrag) != 0)
1570 //                            return 1;
1571 //                        if (!lexicalCast2(matePairEl.libId, buffer))
1572 //                            return 1;
1573 //                    } else if (fdIdentifier == "eid") {
1574 //                        clear(eid);
1575 //                        if (readUntilOneOf(eid, readerFrag, ',', ']') != 0)
1576 //                            return 1;
1577 //                    } else {
1578 //                        // Jump to next attribute
1579 //                        if (readUntilOneOf(buffer, readerFrag, ',', ']') != 0)
1580 //                            return 1;
1581 //                    }
1582 //                }
1583 //            }
1584 //            if (skipLine(readerFrag) != 0)
1585 //                return 1;
1586 //
1587 //            // Read the two reads belonging to this mate pair
1588 //            clear(buffer);
1589 //            for (int i = 0; i < 2; ++i)
1590 //            {
1591 //                if (skipWhitespaces(readerFrag) != 0)
1592 //                    return 1;
1593 //                clear(buffer);
1594 //                if (readDigits(buffer, readerFrag) != 0)
1595 //                    return 1;
1596 //                if (!lexicalCast2(matePairEl.readId[i], buffer))
1597 //                    return 1;
1598 //                if (!i)  // Skip ','.
1599 //                    goNext(readerFrag);
1600 //            }
1601 //            int res = skipLine(readerFrag);
1602 //            if (res != 0 && res != EOF_BEFORE_SUCCESS)
1603 //                return 1;
1604 //
1605 //            // Insert mate pair
1606 //            if (matePairEl.readId[0] != matePairEl.readId[1]) {
1607 //                frgIdMap.insert(std::make_pair(id, static_cast<TId>(length(fragStore.matePairStore))));
1608 //                appendValue(fragStore.matePairStore, matePairEl, Generous());
1609 //                appendValue(fragStore.matePairNameStore, eid, Generous());
1610 //            }
1611 //        } else {
1612 //            int res = skipLine(readerFrag);
1613 //            if (res != 0 && res != EOF_BEFORE_SUCCESS)
1614 //                return 1;
1615 //        }
1616 //    }
1617 //    fclose(strmFrag);
1618 //
1619 //    // Read libraries
1620 //    fileName = filePath + 'L';
1621 //    FILE* strmLib = fopen(fileName.c_str(), "rb");
1622 //    if (!strmLib)
1623 //        return 1;
1624 //    RecordReader<FILE *, SinglePass<> > readerLib(strmLib);
1625 //    while (!atEnd(readerLib))
1626 //    {
1627 //        if (value(readerLib) == '>')
1628 //        {
1629 //            TLibraryStoreElement libEl;
1630 //            goNext(readerLib);
1631 //            if (skipWhitespaces(readerLib) != 0)
1632 //                return 1;
1633 //
1634 //            // Get the fragment id
1635 //            clear(buffer);
1636 //            if (readDigits(buffer, readerLib) != 0)
1637 //                return 1;
1638 //            TId id = 0;
1639 //            if (!lexicalCast2(id, buffer))
1640 //                return 1;
1641 //
1642 //            // Any attributes?
1643 //            std::stringstream input;
1644 //            input << "L" << id;
1645 //            String<char> eid(input.str().c_str());
1646 //            if (value(readerLib) == '[')
1647 //            {
1648 //                String<char> fdIdentifier;
1649 //                while (value(readerLib) != ']')
1650 //                {
1651 //                    goNext(readerLib);
1652 //                    if (skipWhitespaces(readerLib) != 0)
1653 //                        return 1;
1654 //                    clear(fdIdentifier);
1655 //                    if (readAlphaNums(fdIdentifier, readerLib) != 0)
1656 //                        return 1;
1657 //                    if (fdIdentifier == "eid")
1658 //                    {
1659 //                        clear(eid);
1660 //                        if (readUntilOneOf(eid, readerLib, ',', ']') != 0)
1661 //                            return 1;
1662 //                    } else {
1663 //                        // Jump to next attribute
1664 //                        // TODO(holtgrew): skipUntilOneOf()?
1665 //                        if (readUntilOneOf(buffer, readerLib, ',', ']') != 0)
1666 //                            return 1;
1667 //                    }
1668 //                }
1669 //            }
1670 //            if (skipLine(readerLib) != 0)
1671 //                return 1;
1672 //            if (skipWhitespaces(readerLib) != 0)
1673 //                return 1;
1674 //
1675 //            // Read the mean and standard deviation
1676 //            clear(buffer);
1677 //            if (readDigits(buffer, readerLib) != 0)
1678 //                return 1;
1679 //            if (!lexicalCast2(libEl.mean, buffer))
1680 //                return 1;
1681 //            if (skipWhitespaces(readerLib) != 0)
1682 //                return 1;
1683 //            goNext(readerLib);
1684 //            clear(buffer);
1685 //            if (readDigits(buffer, readerLib) != 0)
1686 //                return 1;
1687 //            if (!lexicalCast2(libEl.std, buffer))
1688 //                return 1;
1689 //            int res = skipLine(readerLib);
1690 //            if (res != 0 && res != EOF_BEFORE_SUCCESS)
1691 //                return 1;
1692 //
1693 //            // Insert mate pair
1694 //            libIdMap.insert(std::make_pair(id, static_cast<TId>(length(fragStore.libraryStore))));
1695 //            appendValue(fragStore.libraryStore, libEl, Generous());
1696 //            appendValue(fragStore.libraryNameStore, eid, Generous());
1697 //        } else {
1698 //            int res = skipLine(readerLib);
1699 //            if (res != 0 && res != EOF_BEFORE_SUCCESS)
1700 //                return 1;
1701 //        }
1702 //    }
1703 //    fclose(strmLib);
1704 //
1705 //    // Renumber all ids
1706 //    typedef typename TIdMap::const_iterator TIdMapIter;
1707 //    typedef typename Iterator<typename TFragmentStore::TMatePairStore>::Type TMateIter;
1708 //    TMateIter mateIt = begin(fragStore.matePairStore);
1709 //    TMateIter mateItEnd = end(fragStore.matePairStore);
1710 //    for(;mateIt != mateItEnd; goNext(mateIt)) {
1711 //        if (mateIt->libId != TMatePairElement::INVALID_ID) {
1712 //            TIdMapIter libIdPos = libIdMap.find(mateIt->libId);
1713 //            if (libIdPos != libIdMap.end()) mateIt->libId = libIdPos->second;
1714 //            else mateIt->libId = TMatePairElement::INVALID_ID;
1715 //        }
1716 //        if (mateIt->readId[0] != TMatePairElement::INVALID_ID) {
1717 //            TIdMapIter readIdPos = readIdMap.find(mateIt->readId[0]);
1718 //            if (readIdPos != readIdMap.end()) mateIt->readId[0] = readIdPos->second;
1719 //            else mateIt->readId[0] = TMatePairElement::INVALID_ID;
1720 //        }
1721 //        if (mateIt->readId[1]!= TMatePairElement::INVALID_ID) {
1722 //            TIdMapIter readIdPos = readIdMap.find(mateIt->readId[1]);
1723 //            if (readIdPos != readIdMap.end()) mateIt->readId[1] = readIdPos->second;
1724 //            else mateIt->readId[0] = TMatePairElement::INVALID_ID;
1725 //        }
1726 //    }
1727 //    typedef typename Iterator<typename TFragmentStore::TReadStore>::Type TReadIter;
1728 //    TReadIter readIt = begin(fragStore.readStore);
1729 //    TReadIter readItEnd = end(fragStore.readStore);
1730 //    for(;readIt != readItEnd; goNext(readIt)) {
1731 //        if (readIt->matePairId != TReadStoreElement::INVALID_ID) {
1732 //            TIdMapIter mateIdPos = frgIdMap.find(readIt->matePairId);
1733 //            if (mateIdPos != frgIdMap.end()) readIt->matePairId = mateIdPos->second;
1734 //            else readIt->matePairId = TReadStoreElement::INVALID_ID;
1735 //        }
1736 //    }
1737 //    typedef typename Iterator<typename TFragmentStore::TAlignedReadStore>::Type TAlignIter;
1738 //    TAlignIter alignIt = begin(fragStore.alignedReadStore);
1739 //    TAlignIter alignItEnd = end(fragStore.alignedReadStore);
1740 //    for(;alignIt != alignItEnd; goNext(alignIt)) {
1741 //        if (alignIt->readId != TAlignedElement::INVALID_ID) {
1742 //            TIdMapIter readIdPos = readIdMap.find(alignIt->readId);
1743 //            if (readIdPos != readIdMap.end()) alignIt->readId = readIdPos->second;
1744 //            else alignIt->readId = TAlignedElement::INVALID_ID;
1745 //        }
1746 //        if (moveToFront) {
1747 //            alignIt->beginPos -= minPos;
1748 //            alignIt->endPos -= minPos;
1749 //        }
1750 //    }
1751 //    return 0;
1752 }
1753 
1754 
1755 //////////////////////////////////////////////////////////////////////////////
1756 // Rudimentary write functions for CeleraFrg and Celera Cgb
1757 //////////////////////////////////////////////////////////////////////////////
1758 
1759 template<typename TFile, typename TSpec, typename TConfig>
1760 inline void
_writeCeleraFrg(TFile & file,FragmentStore<TSpec,TConfig> & fragStore)1761 _writeCeleraFrg(TFile& file,
1762                 FragmentStore<TSpec, TConfig>& fragStore)
1763 {
1764 //IOREV _nodoc_
1765     typedef FragmentStore<TSpec, TConfig> TFragmentStore;
1766     typedef typename Size<TFragmentStore>::Type TSize;
1767     typedef typename TFragmentStore::TReadPos TReadPos;
1768 
1769     typename DirectionIterator<TFile, Output>::Type target = directionIterator(file, Output());
1770 
1771     // Iterate over all aligned reads to get the clear ranges
1772     typedef Pair<TReadPos, TReadPos> TClrRange;
1773     String<TClrRange> clearStr;
1774     resize(clearStr, length(fragStore.readStore));
1775     typedef typename Iterator<typename TFragmentStore::TAlignedReadStore>::Type TAlignIter;
1776     TAlignIter alignIt = begin(fragStore.alignedReadStore);
1777     TAlignIter alignItEnd = end(fragStore.alignedReadStore);
1778     for(;alignIt != alignItEnd; goNext(alignIt))
1779     {
1780         TReadPos begClr = 0;
1781         TReadPos endClr = 0;
1782         getClrRange(fragStore, value(alignIt), begClr, endClr);
1783         value(clearStr, alignIt->readId) = TClrRange(begClr, endClr);
1784     }
1785 
1786     // Write Reads
1787     typedef typename Iterator<typename TFragmentStore::TReadStore>::Type TReadIter;
1788     TReadIter readIt = begin(fragStore.readStore);
1789     TReadIter readItEnd = end(fragStore.readStore);
1790     bool noNamesPresent = (length(fragStore.readNameStore) == 0);
1791     for (TSize idCount = 0; readIt != readItEnd; goNext(readIt), ++idCount)
1792     {
1793         write(target, "{FRG\nact:A\nacc:");
1794         appendNumber(target, idCount + 1);
1795         write(target, "\ntyp:R\n");
1796         if (!noNamesPresent)
1797         {
1798             write(target, "src:\n");
1799             write(target, fragStore.readNameStore[idCount]);
1800             write(target, "\n.\n");
1801         }
1802         write(target, "etm:0\nseq:\n");
1803         writeWrappedString(target, fragStore.readSeqStore[idCount], 70);
1804         write(target, ".\nqlt:\n");
1805         typedef typename Value<typename TFragmentStore::TReadSeqStore>::Type TReadSeq;
1806         typedef QualityExtractor<typename Value<TReadSeq>::Type> TQualityExtractor;
1807         ModifiedString<TReadSeq const, ModView<TQualityExtractor> > quals(fragStore.readSeqStore[idCount]);
1808         writeWrappedString(target, quals, 70);
1809         // Note: Clear range does not have to be ordered, e.g. no indication for reverse complemented reads, this is happening in cgb records
1810         write(target, ".\nclr:");
1811         appendNumber(target, clearStr[idCount].i1);
1812         writeValue(target, ',');
1813         appendNumber(target, clearStr[idCount].i2);
1814         write(target, "\n}\n");
1815     }
1816 }
1817 
1818 
1819 //////////////////////////////////////////////////////////////////////////////
1820 
1821 // Writes out the first contig only.
1822 
1823 // TODO(holtgrew): This is only used in seqcons, should it go into app?
1824 template<typename TFile, typename TSpec, typename TConfig>
1825 inline void
_writeCeleraCgb(TFile & file,FragmentStore<TSpec,TConfig> & fragStore)1826 _writeCeleraCgb(TFile& file,
1827                 FragmentStore<TSpec, TConfig>& fragStore)
1828 {
1829     typedef FragmentStore<TSpec, TConfig> TFragmentStore;
1830     typedef typename Size<TFragmentStore>::Type TSize;
1831     typedef typename Id<TFragmentStore>::Type TId;
1832     //typedef typename TFragmentStore::TReadPos TReadPos;
1833 
1834     typename DirectionIterator<TFile, Output>::Type target = directionIterator(file, Output());
1835 
1836     // Write the first contig
1837     TId contigId = 0;
1838 
1839     // Sort the reads according to position
1840     sortAlignedReads(fragStore.alignedReadStore, SortBeginPos());
1841 
1842     // Write Header
1843     write(target, "{IUM\nacc:0\nsrc:\ngen> @@ [0,0]\n.\ncov:0.000\nsta:X\nfur:X\nabp:0\nbbp:0\nlen:");
1844     appendNumber(target, length((value(fragStore.contigStore, contigId)).seq));
1845     write(target, "\ncns:\n.\nqlt:\n.\nfor:0\nnfr:");
1846     appendNumber(target, length(fragStore.readStore));
1847     writeValue(target, '\n');
1848 
1849     // Write reads
1850     typedef typename Iterator<typename TFragmentStore::TAlignedReadStore>::Type TAlignIter;
1851     TAlignIter alignIt = begin(fragStore.alignedReadStore);
1852     TAlignIter alignItEnd = end(fragStore.alignedReadStore);
1853     TSize offsetLeft = _min(alignIt->beginPos, alignIt->endPos);
1854     for (;alignIt != alignItEnd; goNext(alignIt))
1855     {
1856         if (contigId != alignIt->contigId)
1857             continue;
1858         write(target, "{IMP\ntyp:R\nmid:");
1859         appendNumber(target, alignIt->readId + 1);
1860         write(target, "\ncon:0\npos:");
1861         appendNumber(target, alignIt->beginPos - offsetLeft);
1862         writeValue(target, ',');
1863         appendNumber(target, alignIt->endPos - offsetLeft);
1864         write(target, "\ndln:0\ndel:\n}\n");
1865     }
1866     write(target, "}\n");
1867 }
1868 
1869 }  // namespace seqan
1870 
1871 #endif //#ifndef SEQAN_HEADER_...
1872