1 // ==========================================================================
2 // SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2015, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 // * Redistributions of source code must retain the above copyright
11 // notice, this list of conditions and the following disclaimer.
12 // * Redistributions in binary form must reproduce the above copyright
13 // notice, this list of conditions and the following disclaimer in the
14 // documentation and/or other materials provided with the distribution.
15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 // its contributors may be used to endorse or promote products derived
17 // from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: Tobias Rausch <rausch@embl.de>
33 // ==========================================================================
34
35 #ifndef SEQAN_HEADER_CONSENSUS_BASE_H
36 #define SEQAN_HEADER_CONSENSUS_BASE_H
37
38 namespace SEQAN_NAMESPACE_MAIN
39 {
40
41
42 //////////////////////////////////////////////////////////////////////////////
43 // Segment Match Generation tag
44 //////////////////////////////////////////////////////////////////////////////
45
46 // TODO(holtgrew): This apparently belongs into graph_msa?
47
48 struct OverlapLibrary_;
49 typedef Tag<OverlapLibrary_> const OverlapLibrary;
50
51
52
53 //////////////////////////////////////////////////////////////////////////////
54 // Consensus tag
55 //////////////////////////////////////////////////////////////////////////////
56
57 /*
58 * @defgroup ConsensusCallingTags Consensus Calling Tags
59 * @brief Tags for consensus calling.
60 *
61 * @tag ConsensusCallingTags#MajorityVote
62 * @headerfile <seqan/consensus.h>
63 * @brief Consensus based on the most common character.
64 *
65 * @tag ConsensusCallingTags#Bayesian
66 * @headerfile <seqan/consensus.h>
67 * @brief Consensus based on a bayesian probability.
68 */
69
70
71 struct MajorityVote_;
72 typedef Tag<MajorityVote_> const MajorityVote;
73
74 struct Bayesian_;
75 typedef Tag<Bayesian_> const Bayesian;
76
77
78
79 //////////////////////////////////////////////////////////////////////////////
80 // Read alignment and Consensus Generation
81 //////////////////////////////////////////////////////////////////////////////
82
83
84 //////////////////////////////////////////////////////////////////////////////////
85
86 struct ConsensusOptions {
87 public:
88 // Method
89 // 0: graph-based multiple sequence alignment
90 // 1: realign
91 int method;
92
93 // ReAlign Method
94 // 0: Needleman-Wunsch
95 // 1: Gotoh
96 int rmethod;
97
98 // Bandwidth of overlap alignment
99 int bandwidth;
100
101 // Number of computed overlaps per read (at the beginning and end of a read)
102 int overlaps;
103
104 // Minimum match length of a computed overlap
105 int matchlength;
106
107 // Minimum quality (in percent identity) of a computed overlap
108 int quality;
109
110 // Window size, only relevant for insert sequencing
111 // If window == 0, no insert sequencing is assumed
112 int window;
113
114 // Output
115 // 0: seqan style
116 // 1: afg output format
117 // 2: frg output format
118 // 3: cgb output format
119 // 4: Sam output format
120 int output;
121
122 // Multi-read alignment
123 bool noalign;
124
125 // Offset all reads, so the first read starts at position 0
126 bool moveToFront;
127
128 // Include reference genome
129 bool include;
130
131 // Scoring object for overlap alignments
132 Score<int> sc;
133
134 // Various input and output files
135 std::string readsfile; // File of reads in FASTA format
136 std::string afgfile; // AMOS afg file input
137 std::string samfile; // Sam file input
138 std::string contigsfile; // FASTA reference file for Sam input
139 std::string outfile; // Output file name
140
141 // Initialization
ConsensusOptionsConsensusOptions142 ConsensusOptions() :
143 method(0), rmethod(0), bandwidth(0), overlaps(0), matchlength(0), quality(0),
144 window(0), output(0), noalign(false), moveToFront(0), include(0), sc(2, -6, -4, -9)
145 {}
146 };
147
148
149 //////////////////////////////////////////////////////////////////////////////////
150
151 // Copy reads for the whole contig out of fragStore and into strSet. The start and end
152 // positions of the alignments go into startEndPos.
153
154 template<typename TValue, typename TStrSpec, typename TPosPair, typename TStringSpec, typename TSpec, typename TConfig, typename TId>
155 inline void
_loadContigReads(StringSet<TValue,Owner<TStrSpec>> & strSet,String<TPosPair,TStringSpec> & startEndPos,FragmentStore<TSpec,TConfig> const & fragStore,TId const contigId)156 _loadContigReads(StringSet<TValue, Owner<TStrSpec> > & strSet,
157 String<TPosPair, TStringSpec> & startEndPos,
158 FragmentStore<TSpec, TConfig> const & fragStore,
159 TId const contigId)
160 {
161 typedef FragmentStore<TSpec, TConfig> const TFragmentStore;
162 typedef typename Size<TFragmentStore>::Type TSize;
163 typedef typename TFragmentStore::TReadPos TReadPos;
164
165
166 // Sort aligned reads according to contig id
167 sortAlignedReads(fragStore.alignedReadStore, SortContigId());
168 resize(strSet, length(fragStore.alignedReadStore));
169
170 // Retrieve all reads, limit them to the clear range and if required reverse complement them
171 typedef typename Iterator<typename TFragmentStore::TAlignedReadStore const>::Type TAlignIter;
172 TAlignIter alignIt = lowerBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId());
173 TAlignIter alignItEnd = upperBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId());
174 TSize numRead = 0;
175 TReadPos begClr = 0;
176 TReadPos endClr = 0;
177 TSize lenRead = 0;
178 TSize offset = 0;
179 for(;alignIt != alignItEnd; ++alignIt) {
180 offset = _min(alignIt->beginPos, alignIt->endPos);
181 getClrRange(fragStore, *alignIt, begClr, endClr);
182 strSet[numRead] = infix(fragStore.readSeqStore[alignIt->readId], begClr, endClr);
183 lenRead = endClr - begClr;
184 if (alignIt->beginPos < alignIt->endPos) appendValue(startEndPos, TPosPair(offset, offset + lenRead), Generous());
185 else {
186 reverseComplement(strSet[numRead]);
187 appendValue(startEndPos, TPosPair(offset + lenRead, offset), Generous());
188 }
189 ++numRead;
190 }
191 resize(strSet, numRead, Exact());
192 }
193
194
195
196
197 //////////////////////////////////////////////////////////////////////////////
198
199 template<typename TSpec, typename TConfig, typename TMatrix, typename TSize2, typename TSize, typename TReadSlot>
200 inline bool
convertAlignment(FragmentStore<TSpec,TConfig> & fragStore,TMatrix & mat,TSize2 contigId,TSize & coverage,TReadSlot & slot)201 convertAlignment(FragmentStore<TSpec, TConfig>& fragStore,
202 TMatrix& mat,
203 TSize2 contigId,
204 TSize& coverage,
205 TReadSlot& slot)
206 {
207 typedef FragmentStore<TSpec, TConfig> TFragmentStore;
208 typedef typename Value<TMatrix>::Type TValue;
209 typedef typename TFragmentStore::TContigPos TContigPos;
210
211 // Gap char
212 TValue gapChar = gapValue<TValue>();
213
214 // Sort according to contigId
215 sortAlignedReads(fragStore.alignedReadStore, SortContigId());
216
217 // Find range of the given contig
218 typedef typename Iterator<typename TFragmentStore::TAlignedReadStore, Standard>::Type TAlignIter;
219 TAlignIter alignIt = lowerBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId());
220 TAlignIter alignItEnd = upperBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId());
221
222 // Sort the reads according to the begin position
223 sortAlignedReads(infix(fragStore.alignedReadStore, alignIt - begin(fragStore.alignedReadStore, Standard()), alignItEnd - begin(fragStore.alignedReadStore, Standard())), SortBeginPos());
224 TAlignIter alignItBegin = alignIt = lowerBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId());
225 alignItEnd = upperBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId());
226
227 // Get the maximum coverage and the slot for each read
228 typedef String<TSize> TFirstFreePos;
229 typedef typename Iterator<TFirstFreePos, Standard>::Type TPosIter;
230 TFirstFreePos freePos;
231 TSize pos = 0;
232 TSize maxTmp = 0;
233 TSize numCol = 0;
234 reserve(slot, alignItEnd - alignIt);
235 for(;alignIt != alignItEnd; ++alignIt) {
236 TPosIter itPos = begin(freePos, Standard());
237 TPosIter itPosEnd = end(freePos, Standard());
238 pos = 0;
239 for(;itPos != itPosEnd; ++itPos, ++pos)
240 if ((TContigPos)*itPos < _min(alignIt->beginPos, alignIt->endPos)) break;
241 if (pos + 1 > length(freePos)) resize(freePos, pos+1, Generous());
242 maxTmp = _max(alignIt->beginPos, alignIt->endPos);
243 freePos[pos] = maxTmp;
244 if (maxTmp > numCol) numCol = maxTmp;
245 appendValue(slot, pos);
246 }
247 coverage = length(freePos);
248 clear(freePos);
249
250 // Fill the matrix
251 typedef typename Iterator<TMatrix, Standard>::Type TMatIter;
252 resize(mat, coverage * numCol, '.');
253 alignIt = alignItBegin;
254 TSize readPos = 0;
255 TMatIter matIt = begin(mat, Standard());
256 typename TFragmentStore::TReadSeq myRead;
257 for(;alignIt != alignItEnd; ++alignIt, ++readPos) {
258 typedef typename Iterator<String<typename TFragmentStore::TReadGapAnchor>, Standard>::Type TReadGapsIter;
259 TReadGapsIter itGaps = begin(alignIt->gaps, Standard());
260 TReadGapsIter itGapsEnd = end(alignIt->gaps, Standard());
261
262 // Place each read inside the matrix
263 myRead = fragStore.readSeqStore[alignIt->readId];
264 TSize lenRead = length(myRead);
265 TSize offset = alignIt->beginPos;
266 if (alignIt->beginPos > alignIt->endPos) {
267 reverseComplement(myRead);
268 offset = alignIt->endPos;
269 }
270 matIt = begin(mat, Standard());
271 matIt += (slot[readPos] * numCol + offset);
272
273 typedef typename Iterator<typename TFragmentStore::TReadSeq, Standard>::Type TReadIter;
274 TReadIter seqReadIt = begin(myRead, Standard());
275
276 // First clear range
277 TSize mySeqPos = 0;
278 int diff = 0;
279 if ((itGaps != itGapsEnd) && (itGaps->gapPos == 0)) {
280 mySeqPos = itGaps->seqPos;
281 diff = -1 * mySeqPos;
282 seqReadIt += mySeqPos;
283 }
284 TSize clr2 = lenRead;
285 TSize stop = 0;
286 for(;itGaps != itGapsEnd; ++itGaps) {
287 // Any clipped sequence at the end
288 stop = itGaps->seqPos;
289 if (diff - ((int) itGaps->gapPos - (int) itGaps->seqPos) > 0)
290 clr2 = stop = lenRead - (diff - ((int) itGaps->gapPos - (int) itGaps->seqPos));
291
292 for(;mySeqPos < stop; ++matIt, ++seqReadIt, ++mySeqPos)
293 *matIt = *seqReadIt;
294
295 for(int i = 0; i < ((int) itGaps->gapPos - (int) itGaps->seqPos) - diff; ++i, ++matIt)
296 *matIt = gapChar;
297
298 diff = (itGaps->gapPos - itGaps->seqPos);
299 }
300 for(;mySeqPos < clr2; ++mySeqPos, ++seqReadIt, ++matIt)
301 *matIt = *seqReadIt;
302 }
303 //for(TSize row = 0; row < coverage; ++row) {
304 // for(TSize col = 0; col<numCol; ++col) {
305 // std::cout << mat[row * numCol + col];
306 // }
307 // std::cout << std::endl;
308 //}
309 return true;
310 }
311
312 //////////////////////////////////////////////////////////////////////////////
313
314 template<typename TSpec, typename TConfig, typename TMatrix, typename TSize2, typename TSize>
315 inline bool
convertAlignment(FragmentStore<TSpec,TConfig> & fragStore,TMatrix & mat,TSize2 contigId,TSize & coverage)316 convertAlignment(FragmentStore<TSpec, TConfig>& fragStore,
317 TMatrix& mat,
318 TSize2 contigId,
319 TSize& coverage)
320 {
321 String<TSize> slot;
322 return convertAlignment(fragStore, mat, contigId, coverage, slot);
323 }
324
325 //////////////////////////////////////////////////////////////////////////////
326
327 template<typename TSpec, typename TConfig, typename TMatrix>
328 inline bool
convertAlignment(FragmentStore<TSpec,TConfig> & fragStore,TMatrix & mat)329 convertAlignment(FragmentStore<TSpec, TConfig>& fragStore,
330 TMatrix& mat)
331 {
332 typedef FragmentStore<TSpec, TConfig> TFragmentStore;
333 typedef typename Size<TFragmentStore>::Type TSize;
334 TSize coverage;
335 return convertAlignment(fragStore, mat, 0, coverage);
336 }
337
338 //////////////////////////////////////////////////////////////////////////////
339
340 template<typename TSpec, typename TConfig, typename TGappedConsensus, typename TSize>
341 inline void
getGappedConsensus(FragmentStore<TSpec,TConfig> & fragStore,TGappedConsensus & gappedConsensus,TSize contigId)342 getGappedConsensus(FragmentStore<TSpec, TConfig>& fragStore,
343 TGappedConsensus& gappedConsensus,
344 TSize contigId)
345 {
346 typedef FragmentStore<TSpec, TConfig> TFragmentStore;
347 typedef typename Value<TGappedConsensus>::Type TValue;
348 typedef typename TFragmentStore::TContigPos TContigPos;
349
350 TValue gapChar = gapValue<TValue>();
351 typedef typename Iterator<typename TFragmentStore::TContigSeq, Standard>::Type TContigIter;
352 TContigIter seqContigIt = begin(fragStore.contigStore[contigId].seq, Standard());
353 TContigIter seqContigItEnd = end(fragStore.contigStore[contigId].seq, Standard());
354 typedef typename Iterator<String<typename TFragmentStore::TContigGapAnchor>, Standard>::Type TGapsIter;
355 TGapsIter itGaps = begin(fragStore.contigStore[contigId].gaps, Standard());
356 TGapsIter itGapsEnd = end(fragStore.contigStore[contigId].gaps, Standard());
357 int diff = 0;
358 TContigPos mySeqPos = 0;
359 for(;itGaps != itGapsEnd; goNext(itGaps)) {
360 for(;mySeqPos < itGaps->seqPos; ++seqContigIt, ++mySeqPos)
361 appendValue(gappedConsensus, *seqContigIt, Generous());
362
363 for(int i = 0; i < ((int) itGaps->gapPos - (int) itGaps->seqPos) - diff; ++i)
364 appendValue(gappedConsensus, gapChar, Generous());
365 diff = (itGaps->gapPos - itGaps->seqPos);
366 }
367 for(;seqContigIt != seqContigItEnd; ++seqContigIt)
368 appendValue(gappedConsensus, *seqContigIt, Generous());
369 }
370
371
372 //////////////////////////////////////////////////////////////////////////////
373
374 template<typename TSpec, typename TConfig, typename TGappedConsensus, typename TSize>
375 inline void
assignGappedConsensus(FragmentStore<TSpec,TConfig> & fragStore,TGappedConsensus & gappedCons,TSize contigId)376 assignGappedConsensus(FragmentStore<TSpec, TConfig>& fragStore,
377 TGappedConsensus& gappedCons,
378 TSize contigId)
379 {
380 typedef FragmentStore<TSpec, TConfig> TFragmentStore;
381 typedef typename Value<TGappedConsensus>::Type TValue;
382 TValue gapChar = gapValue<TValue>();
383
384 // Update the contig
385 typedef typename Value<typename TFragmentStore::TContigStore>::Type TContigStoreElement;
386 TContigStoreElement& contigEl = fragStore.contigStore[contigId];
387 clear(contigEl.gaps);
388 clear(contigEl.seq);
389
390 // Create the sequence and the gap anchors
391 typedef typename Iterator<TGappedConsensus, Standard>::Type TStringIter;
392 TStringIter seqIt = begin(gappedCons, Standard());
393 TStringIter seqItEnd = end(gappedCons, Standard());
394 typedef typename TFragmentStore::TReadPos TReadPos;
395 typedef typename TFragmentStore::TContigGapAnchor TContigGapAnchor;
396 TReadPos ungappedPos = 0;
397 TReadPos gappedPos = 0;
398 bool gapOpen = false;
399 for(;seqIt != seqItEnd; ++seqIt, ++gappedPos) {
400 if (*seqIt == gapChar) gapOpen = true;
401 else {
402 if (gapOpen) {
403 appendValue(contigEl.gaps, TContigGapAnchor(ungappedPos, gappedPos), Generous());
404 gapOpen = false;
405 }
406 Dna5Q letter = *seqIt;
407 assignQualityValue(letter, 'D');
408 appendValue(contigEl.seq, letter);
409 ++ungappedPos;
410 }
411 }
412 if (gapOpen)
413 appendValue(contigEl.gaps, TContigGapAnchor(ungappedPos, gappedPos), Generous());
414 }
415
416
417
418 //////////////////////////////////////////////////////////////////////////////////
419
420 /*
421 * @fn consensusAlignment
422 * @headerfile <seqan/consensus.h>
423 * @brief Compute consensus alignment.
424 *
425 * @signature void consensusAlignment(alignmentGraph, beginEndPos[, options])
426 *
427 * @param[out] alignmentGraph Alignment graph to build.
428 * @param[in] options Optional settings for the consenus alignment, type: <tt>ConsensusOptions</tt>.
429 * @param[in] beginEndPos Interval start and end position for the read's alignment; <tt>String<Pair<TPos, TPos> ></tt>.
430 *
431 * @section Example
432 *
433 * @code{.cpp}
434 * #include <seqan/sequence.h>
435 * #include <seqan/graph_align.h>
436 * #include <seqan/consensus.h>
437 *
438 * int main()
439 * {
440 * using namespace seqan;
441 *
442 * typedef StringSet<Dna5String> TStringSet;
443 * typedef Graph<Alignment<TStringSet, void, WithoutEdgeId> > TAlignGraph;
444 *
445 * TStringSet readSet;
446 * String<Pair<TSize> > begEndPos;
447 *
448 * appendValue(readSet, "CCCAGTGA");
449 * appendValue(begEndPos, Pair<TSize>(0, 5));
450 * appendValue(readSet, "AGGGACTGT");
451 * appendValue(begEndPos, Pair<TSize>(3, 9));
452 *
453 * TAlignGraph alignmentGraph(readSet);
454 * consensusAlignment(alignmentGraph, begEndPos);
455 *
456 * return 0;
457 * }
458 * @endcode
459 */
460
461 template<typename TStringSet, typename TCargo, typename TSpec, typename TSize, typename TConfigOptions>
462 inline void
consensusAlignment(Graph<Alignment<TStringSet,TCargo,TSpec>> & gOut,String<Pair<TSize,TSize>> & begEndPos,TConfigOptions const & consOpt)463 consensusAlignment(Graph<Alignment<TStringSet, TCargo, TSpec> >& gOut,
464 String<Pair<TSize, TSize> >& begEndPos,
465 TConfigOptions const& consOpt)
466 {
467 typedef Graph<Alignment<TStringSet, TCargo, TSpec> > TOutGraph;
468 typedef typename Id<TOutGraph>::Type TId;
469
470 // Initialization
471 TStringSet& seqSet = stringSet(gOut);
472
473 // Select all overlapping reads and record the diagonals of the band
474 String<Pair<TId, TId> > pList;
475 String<Pair<int, int> > diagList;
476 if (consOpt.window == 0)
477 selectPairsAssembly(seqSet, begEndPos, consOpt.bandwidth, pList, diagList);
478 else
479 selectPairsAllAgainstAll(seqSet, begEndPos, consOpt.window, pList, diagList);
480
481 // Set-up a sparse distance matrix
482 Graph<Undirected<double> > pairGraph;
483
484 // Containers for segment matches and corresponding scores
485 typedef String<Fragment<> > TFragmentString;
486 TFragmentString matches;
487 typedef String<int> TScoreValues;
488 TScoreValues scores;
489
490 // Compute segment matches from global pairwise alignments
491 appendSegmentMatches(seqSet, pList, diagList, begEndPos, consOpt.sc, consOpt.matchlength, consOpt.quality, consOpt.overlaps, matches, scores, pairGraph, OverlapLibrary() );
492 clear(pList);
493 clear(diagList);
494
495 // If there are no alignment matches, return
496 if (!length(matches)) return;
497
498 // Use these segment matches for the initial alignment graph
499 typedef Graph<Alignment<TStringSet, TSize> > TGraph;
500 TGraph g(seqSet);
501 buildAlignmentGraph(matches, scores, g, consOpt.sc, ReScore() );
502 clear(matches);
503 clear(scores);
504
505 // Guide Tree
506 Graph<Tree<double> > guideTree;
507 upgmaTree(pairGraph, guideTree);
508 clear(pairGraph);
509
510 // Triplet library extension
511 graphBasedTripletLibraryExtension(g);
512
513 // Perform a progressive alignment
514 progressiveAlignment(g, guideTree, gOut);
515 clear(g);
516 clear(guideTree);
517 }
518
519 //////////////////////////////////////////////////////////////////////////////////
520
521
522 template<typename TStringSet, typename TCargo, typename TSpec, typename TSize>
523 inline void
consensusAlignment(Graph<Alignment<TStringSet,TCargo,TSpec>> & gOut,String<Pair<TSize,TSize>> & begEndPos)524 consensusAlignment(Graph<Alignment<TStringSet, TCargo, TSpec> >& gOut,
525 String<Pair<TSize, TSize> >& begEndPos)
526 {
527 ConsensusOptions consOpt;
528 consensusAlignment(gOut, begEndPos, consOpt);
529 }
530
531
532 //////////////////////////////////////////////////////////////////////////////
533
534 template <typename TFragSpec, typename TConfig, typename TStringSet, typename TCargo, typename TSpec, typename TContigId>
535 inline void
updateContig(FragmentStore<TFragSpec,TConfig> & fragStore,Graph<Alignment<TStringSet,TCargo,TSpec>> const & g,TContigId contigId)536 updateContig(FragmentStore<TFragSpec, TConfig>& fragStore,
537 Graph<Alignment<TStringSet, TCargo, TSpec> > const& g,
538 TContigId contigId)
539 {
540 SEQAN_CHECKPOINT
541 typedef Graph<Alignment<TStringSet, TCargo, TSpec> > TGraph;
542 typedef typename Size<TGraph>::Type TSize;
543 typedef typename Value<TStringSet>::Type TString;
544 typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
545 typedef std::map<TSize, TSize> TComponentLength;
546 typedef char TValue;
547
548 // Initialization
549 TStringSet& strSet = stringSet(g);
550 TSize nseq = length(strSet);
551 TValue gapChar = gapValue<TValue>();
552 TValue specialGap = '.';
553 TSize maxCoverage = 0;
554 TSize len = 0;
555 String<TValue> mat;
556
557 // Store for each read the begin position, the end position and the row in the alignment matrix
558 String<TSize> readBegEndRowPos;
559 resize(readBegEndRowPos, 3*nseq);
560
561 // Strongly Connected Components, topological sort, and length of each component
562 String<TSize> component;
563 String<TSize> order;
564 TComponentLength compLength;
565 if (convertAlignment(g, component, order, compLength)) {
566 TSize numOfComponents = length(order);
567
568 // Assign to each sequence the start and end (in terms of component ranks)
569 typedef String<std::pair<TSize, TSize> > TComponentToRank;
570 TComponentToRank compToRank;
571 for(TSize compIndex = 0; compIndex < numOfComponents; ++compIndex)
572 appendValue(compToRank, std::make_pair(order[compIndex], compIndex), Generous());
573 std::sort(begin(compToRank, Standard()), end(compToRank, Standard()));
574
575 typedef Pair<TSize, TSize> TRankPair;
576 typedef String<TRankPair> TSequenceToRanks;
577 TSequenceToRanks seqToRank;
578 resize(seqToRank, nseq);
579 typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
580 TVertexIterator itVertex(g);
581 for(;!atEnd(itVertex);++itVertex) {
582 TVertexDescriptor vert = value(itVertex);
583 TSize seq = idToPosition(strSet, sequenceId(g, vert));
584 if (fragmentBegin(g, vert) == 0)
585 seqToRank[seq].i1 = std::lower_bound(begin(compToRank, Standard()), end(compToRank, Standard()), std::make_pair((TSize) component[vert], (TSize) 0))->second;
586 if (fragmentBegin(g, vert) + fragmentLength(g, vert) == length(strSet[seq]))
587 seqToRank[seq].i2 = std::lower_bound(begin(compToRank, Standard()), end(compToRank, Standard()), std::make_pair((TSize) component[vert], (TSize) 0))->second;
588 }
589 clear(compToRank);
590
591 // Assign the sequences to rows
592 String<TSize> seqToRow;
593 resize(seqToRow, nseq);
594 maxCoverage = 0;
595 typedef String<bool> TLeftOver;
596 typedef typename Iterator<TLeftOver, Standard>::Type TLeftOverIter;
597 TLeftOver leftOver;
598 resize(leftOver, nseq, true);
599 typedef String<std::pair<TSize, TSize> > TSeqToBegin;
600 typedef typename Iterator<TSeqToBegin, Standard>::Type TSeqToBeginIter;
601 TSeqToBegin seqToBegin;
602 TSize finishedSeq = 0;
603 while(finishedSeq < nseq) {
604 TLeftOverIter itL = begin(leftOver, Standard());
605 TLeftOverIter itLEnd = end(leftOver, Standard());
606 for(TSize pos = 0; itL != itLEnd; ++itL, ++pos)
607 if (*itL) appendValue(seqToBegin, std::make_pair((seqToRank[pos]).i1, pos), Generous());
608 std::sort(begin(seqToBegin, Standard()), end(seqToBegin, Standard()));
609
610 TSize endPos = 0;
611 TSeqToBeginIter itSB = begin(seqToBegin, Standard());
612 TSeqToBeginIter itSBEnd = end(seqToBegin, Standard());
613 for(;itSB != itSBEnd;++itSB) {
614 if (endPos <= (*itSB).first) {
615 TSize currentSeq = (*itSB).second;
616 seqToRow[currentSeq] = maxCoverage;
617 endPos = (seqToRank[currentSeq]).i2 + 2;
618 leftOver[currentSeq] = false;
619 ++finishedSeq;
620 }
621 }
622 clear(seqToBegin);
623 ++maxCoverage;
624 }
625 clear(leftOver);
626
627 // Create the matrix
628 len = 0;
629 String<TSize> compOffset;
630 resize(compOffset, numOfComponents);
631 for(TSize compIndex = 0; compIndex < numOfComponents; ++compIndex) {
632 compOffset[order[compIndex]] = len;
633 len+=compLength[order[compIndex]];
634 }
635 resize(mat, len * maxCoverage, gapChar);
636
637 // Fill in the segments
638 typedef typename Infix<TString>::Type TInfix;
639 typedef typename Iterator<TInfix, Standard>::Type TInfixIter;
640 typedef typename TGraph::TPosToVertexMap_ TPosToVertexMap;
641 for(typename TPosToVertexMap::const_iterator it = g.data_pvMap.begin();it != g.data_pvMap.end(); ++it) {
642 TInfix str = label(g,it->second);
643 TSize c = property(component, it->second);
644 TSize row = seqToRow[idToPosition(strSet, it->first.first)];
645 //if (row == 0) {
646 // std::cout << sequenceId(g, it->second) << ':' << str << ',' << strSet[sequenceId(g, it->second)] << std::endl;
647 // std::cout << getProperty(component, it->second) << ',' << order[compIndex] << std::endl;
648 // std::cout << (seqToRank[sequenceId(g, it->second)]).i1 << ',' << (seqToRank[sequenceId(g, it->second)]).i2 << std::endl;
649 //}
650 TInfixIter sIt = begin(str, Standard());
651 TInfixIter sItEnd = end(str, Standard());
652 TSize i = compOffset[c];
653 for(TSize pCol = i;sIt!=sItEnd;++sIt, ++pCol, ++i)
654 mat[row * len + pCol] = *sIt;
655 }
656 String<bool> active;
657 for(TSize compIndex = 0; compIndex < numOfComponents; ++compIndex) {
658 TSize offset = compOffset[order[compIndex]];
659 TSize currentCompLength = compLength[order[compIndex]];
660
661 clear(active);
662 resize(active, maxCoverage, false);
663
664 // Find the empty rows
665 for(TSize i=0;i<nseq; ++i) {
666 if (((seqToRank[i]).i1 <= compIndex) && ((seqToRank[i]).i2 >= compIndex))
667 active[(seqToRow[i])] = true;
668 }
669
670 // Substitute false gaps with special gap character
671 for(TSize i = 0; i < maxCoverage; ++i) {
672 if (!(active[i])) {
673 for(TSize pCol = offset;pCol < offset + currentCompLength;++pCol)
674 mat[i * len + pCol] = specialGap;
675 }
676 }
677 }
678
679 // Get the new begin and end positions
680 for(TSize i=0;i<nseq; ++i) {
681 TVertexDescriptor lastVertex = findVertex(const_cast<TGraph&>(g), positionToId(strSet, i), length(strSet[i]) - 1);
682 TSize readBegin = compOffset[getProperty(component, findVertex(const_cast<TGraph&>(g), positionToId(strSet, i), 0))];
683 TSize readEnd = compOffset[getProperty(component, lastVertex)] + fragmentLength(const_cast<TGraph&>(g), lastVertex);
684 readBegEndRowPos[3*i] = readBegin;
685 readBegEndRowPos[3*i+1] = readEnd;
686 readBegEndRowPos[3*i+2] = seqToRow[i];
687 }
688
689 }
690 clear(component);
691 clear(order);
692 compLength.clear();
693
694
695 //// Debug code
696 //for(TSize row = 0; row<maxCoverage; ++row) {
697 // for(TSize col = 0; col<len; ++col) {
698 // std::cout << mat[row * len + col];
699 // }
700 // std::cout << std::endl;
701 //}
702
703 // Create the new consensus
704 typedef typename Value<TString>::Type TAlphabet;
705 String<TValue> gappedCons;
706 consensusCalling(mat, gappedCons, maxCoverage, TAlphabet(), MajorityVote());
707
708 // Assign new consensus
709 assignGappedConsensus(fragStore, gappedCons, contigId);
710
711 // Update all aligned reads
712 typedef FragmentStore<TSpec, TConfig> TFragmentStore;
713 typedef typename TFragmentStore::TReadPos TReadPos;
714 typedef typename TFragmentStore::TContigPos TContigPos;
715 typedef typename TFragmentStore::TContigGapAnchor TContigGapAnchor;
716 //typedef typename Value<typename TFragmentStore::TAlignedReadStore>::Type TAlignedElement;
717 typedef typename Iterator<typename TFragmentStore::TAlignedReadStore>::Type TAlignIter;
718 sortAlignedReads(fragStore.alignedReadStore, SortContigId());
719 TAlignIter alignIt = lowerBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId());
720 TAlignIter alignItEnd = upperBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId());
721 TReadPos ungappedPos = 0;
722 TReadPos gappedPos = 0;
723 bool gapOpen;
724 for(TSize i = 0;alignIt != alignItEnd; ++alignIt, ++i) {
725 TSize lenRead = length(fragStore.readSeqStore[alignIt->readId]);
726 TReadPos begClr = 0;
727 TReadPos endClr = 0;
728 getClrRange(fragStore, *alignIt, begClr, endClr);
729 clear(alignIt->gaps);
730 ungappedPos = begClr;
731 if (alignIt->beginPos > alignIt->endPos) ungappedPos = lenRead - endClr;
732 if (ungappedPos != 0) appendValue(alignIt->gaps, TContigGapAnchor(ungappedPos, 0));
733 gappedPos = 0;
734 gapOpen = false;
735 for(TSize column = readBegEndRowPos[3*i]; column<readBegEndRowPos[3*i + 1]; ++column, ++gappedPos) {
736 if (mat[readBegEndRowPos[3*i + 2] * len + column] == gapChar) gapOpen = true;
737 else {
738 if (gapOpen) {
739 appendValue(alignIt->gaps, TContigGapAnchor(ungappedPos, gappedPos), Generous());
740 gapOpen = false;
741 }
742 ++ungappedPos;
743 }
744 }
745 if (gapOpen) appendValue(alignIt->gaps, TContigGapAnchor(ungappedPos, gappedPos), Generous());
746 if (alignIt->beginPos < alignIt->endPos) {
747 if (endClr != (TContigPos)lenRead)
748 appendValue(alignIt->gaps, TContigGapAnchor(lenRead, lenRead + (gappedPos - ungappedPos) - (lenRead - endClr)), Generous());
749 } else {
750 if (begClr != 0)
751 appendValue(alignIt->gaps, TContigGapAnchor(lenRead, lenRead + (gappedPos - ungappedPos) - begClr), Generous());
752 }
753
754 // Set new begin and end position
755 if (alignIt->beginPos < alignIt->endPos) {
756 alignIt->beginPos = readBegEndRowPos[3*i];
757 alignIt->endPos = readBegEndRowPos[3*i+1];
758 } else {
759 alignIt->beginPos = readBegEndRowPos[3*i+1];
760 alignIt->endPos = readBegEndRowPos[3*i];
761 }
762 }
763 }
764
765
766 //////////////////////////////////////////////////////////////////////////////
767
768 template <typename TValue, typename TSpec, typename TCounters, typename TSize, typename TAlphabet>
769 inline void
_countLetters(String<TValue,TSpec> const & mat,TCounters & counterValues,TSize alignDepth,TAlphabet)770 _countLetters(String<TValue, TSpec> const & mat,
771 TCounters & counterValues,
772 TSize alignDepth,
773 TAlphabet)
774 {
775 typedef String<TValue, TSpec> const TMatrix;
776 typedef typename Iterator<TMatrix, Standard>::Type TMatIter;
777
778 // Initialization
779 TSize len = length(mat) / alignDepth;
780 TValue gapChar = gapValue<TValue>();
781 TValue specialGap = '.';
782 TSize alphabetSize = ValueSize<TAlphabet>::VALUE;
783
784 // Set-up counter values
785 typedef typename Value<TCounters>::Type TCounter;
786 typedef typename Iterator<TCounters, Standard>::Type TCounterIt;
787 resize(counterValues, len);
788 for(TSize i=0;i<len; ++i) {
789 TCounter counter;
790 resize(counter, alphabetSize + 1, 0);
791 counterValues[i] = counter;
792 }
793
794 // Count all
795 TMatIter matIt = begin(mat, Standard());
796 TMatIter matItEnd = end(mat, Standard());
797 TCounterIt countIt = begin(counterValues, Standard());
798 TSize pos = 0;
799 for(; matIt != matItEnd; ++matIt, ++countIt, ++pos) {
800 if (pos % len == 0) countIt = begin(counterValues, Standard());
801 if (*matIt != specialGap) {
802 if (*matIt == gapChar) ++value(*countIt, alphabetSize);
803 else ++value(*countIt, ordValue((TAlphabet) *matIt));
804 }
805 }
806 }
807
808
809 //////////////////////////////////////////////////////////////////////////////
810
811 template <typename TValue, typename TSpec, typename TGappedCons, typename TAlignDepth, typename TAlphabet>
812 inline void
consensusCalling(String<TValue,TSpec> const & mat,TGappedCons & gappedConsensus,TAlignDepth maxCoverage,TAlphabet,Bayesian)813 consensusCalling(String<TValue, TSpec> const& mat,
814 TGappedCons& gappedConsensus,
815 TAlignDepth maxCoverage,
816 TAlphabet,
817 Bayesian)
818 {
819 typedef double TProbability;
820 typedef String<TProbability> TProbabilityDistribution;
821 typedef String<TProbabilityDistribution> TPositionalPrDist;
822 typedef typename Iterator<TPositionalPrDist, Standard>::Type TPosPrDistIter;
823
824 typedef typename Size<String<TValue, TSpec> >::Type TSize;
825 TSize alphabetSize = ValueSize<TAlphabet>::VALUE;
826 TValue gapChar = gapValue<TValue>();
827 TValue specialGap = '.';
828
829 // Set-up the counters
830 typedef String<TSize> TCounter;
831 typedef String<TCounter> TCounters;
832 TCounters counterValues;
833 _countLetters(mat, counterValues, maxCoverage, TAlphabet() );
834
835
836 // Initialization
837 TSize len = length(mat) / maxCoverage;
838 TProbabilityDistribution backroundDist;
839 resize(backroundDist, alphabetSize + 1, ((TProbability) 1 / (TProbability) (alphabetSize + 1)));
840
841 // Get an initial consensus
842 typedef typename Iterator<TCounters, Standard>::Type TCounterIt;
843 TCounterIt countIt = begin(counterValues, Standard());
844 TCounterIt countItEnd = end(counterValues, Standard());
845 TPositionalPrDist posPrDist;
846 TValue c = TAlphabet();
847 for(;countIt != countItEnd; ++countIt) {
848 TSize max = 0;
849 typedef typename Iterator<TCounter, Standard>::Type TCIt;
850 TCIt cIt = begin(*countIt, Standard());
851 TCIt cItEnd = end(*countIt, Standard());
852 TSize pos = 0;
853 for(;cIt != cItEnd; ++cIt, ++pos) {
854 if (*cIt > max) {
855 max = *cIt;
856 c = (pos == alphabetSize) ? gapChar : (TValue) TAlphabet(pos);
857 }
858 }
859 TProbabilityDistribution prDist;
860 resize(prDist, alphabetSize + 1, 0);
861 if (c == gapChar) prDist[alphabetSize] = 1;
862 else prDist[ordValue((TAlphabet) c)] = 1;
863 appendValue(posPrDist, prDist, Generous());
864 }
865
866 TSize run = 1;
867 TProbabilityDistribution pI;
868 TProbabilityDistribution pIJ;
869 TProbabilityDistribution pIOld;
870 TProbabilityDistribution pIJOld;
871 while (run) {
872 // Store the values from the last iteration
873 pIOld = pI;
874 pIJOld = pIJ;
875
876 // Count all letters in the consensus
877 TProbabilityDistribution nI;
878 resize(nI, alphabetSize + 1, 0);
879 TPosPrDistIter itPosPrDist = begin(posPrDist, Standard());
880 TPosPrDistIter itPosPrDistEnd = end(posPrDist, Standard());
881 for(;itPosPrDist!=itPosPrDistEnd; ++itPosPrDist)
882 for(TSize i = 0; i<(alphabetSize + 1); ++i)
883 nI[i] += (*itPosPrDist)[i];
884
885 // Composition probabilities
886 clear(pI);
887 resize(pI, alphabetSize + 1);
888 TProbability lenPosPrDist = (TProbability) length(posPrDist);
889 for(TSize i = 0; i<length(pI); ++i)
890 pI[i] = nI[i] / lenPosPrDist;
891
892
893 // Count all letters that agree / disagree with the consensus
894 TProbabilityDistribution nIJ;
895 resize(nIJ, (alphabetSize + 1) * (alphabetSize + 1), 0);
896 typedef String<TValue, TSpec> TMatrix;
897 typedef typename Iterator<TMatrix, Standard>::Type TMatIter;
898 TMatIter matIt = begin(mat, Standard());
899 TMatIter matItEnd = end(mat, Standard());
900 itPosPrDist = begin(posPrDist, Standard());
901 TSize pos = 0;
902 for(; matIt != matItEnd; ++matIt, ++itPosPrDist, ++pos) {
903 if (pos % len == 0) itPosPrDist = begin(posPrDist, Standard());
904 TValue c = *matIt;
905 if (c != specialGap) {
906 TSize fragJ = (c != gapChar) ? ordValue(TAlphabet(c)) : alphabetSize;
907 for(TSize consI = 0; consI<(alphabetSize + 1); ++consI)
908 nIJ[consI * (alphabetSize + 1) + fragJ] += (*itPosPrDist)[consI];
909 }
910 }
911
912 // Sequencing error probabilities
913 clear(pIJ);
914 resize(pIJ, (alphabetSize + 1) * (alphabetSize + 1));
915 TProbability sumIJ = 0;
916 for(TSize diag = 0; diag<(alphabetSize + 1); ++diag) sumIJ += nIJ[diag * (alphabetSize + 1) + diag];
917 for(TSize consI = 0; consI<(alphabetSize + 1); ++consI)
918 for(TSize fragJ = 0; fragJ<(alphabetSize + 1); ++fragJ)
919 pIJ[consI * (alphabetSize + 1) + fragJ] = nIJ[consI * (alphabetSize + 1) + fragJ] / sumIJ;
920
921 // Recompute positional probability distribution
922 itPosPrDist = begin(posPrDist, Standard());
923 TSize col = 0;
924 for(;itPosPrDist!=itPosPrDistEnd; ++itPosPrDist, ++col) {
925 TProbabilityDistribution prDist;
926 resize(prDist, alphabetSize + 1);
927 for(TSize consI = 0; consI<(alphabetSize + 1); ++consI) {
928 TProbability numerator = pI[consI];
929 TProbability denominator = 0;
930 for(TSize allI = 0; allI<(alphabetSize + 1); ++allI) {
931 TProbability denominatorSub = value(pI, allI);
932 for(TSize row = 0; row < maxCoverage; ++row) {
933 TValue c = mat[row * len + col];
934 if (c != specialGap) {
935 TSize fragJ = (c != gapChar) ? ordValue(TAlphabet(c)) : alphabetSize;
936 if (allI == consI)
937 numerator *= pIJ[allI * (alphabetSize + 1) + fragJ];
938 denominatorSub *= pIJ[allI * (alphabetSize + 1) + fragJ];
939 }
940 }
941 denominator += denominatorSub;
942 }
943 prDist[consI] = numerator / denominator;
944 }
945 *itPosPrDist = prDist;
946 }
947
948 // Check termination criterion
949 TProbability eps = 0.00001;
950 typedef typename Iterator<TProbabilityDistribution, Standard>::Type TProbIter;
951 TProbIter pIter = begin(pIOld, Standard());
952 TProbIter pIterCompare = begin(pI, Standard());
953 TProbIter pIterEnd = end(pIOld, Standard());
954 TSize runOld = run;
955 for(;pIter != pIterEnd; ++pIter, ++pIterCompare) {
956 if (*pIter > *pIterCompare) {
957 if (*pIter - *pIterCompare > eps) {
958 ++run;
959 break;
960 }
961 } else {
962 if (*pIterCompare - *pIter > eps) {
963 ++run;
964 break;
965 }
966 }
967 }
968 if (runOld == run) {
969 pIter = begin(pIJOld, Standard());
970 pIterCompare = begin(pIJ, Standard());
971 pIterEnd = end(pIJOld, Standard());
972 for(;pIter != pIterEnd; ++pIter, ++pIterCompare) {
973 if (*pIter > *pIterCompare) {
974 if (*pIter - *pIterCompare > eps) {
975 ++run;
976 break;
977 }
978 } else {
979 if (*pIterCompare - *pIter > eps) {
980 ++run;
981 break;
982 }
983 }
984 }
985 }
986
987 if (runOld == run) {
988 std::cout << "Iterations: " << run << std::endl;
989 run = 0;
990 }
991 }
992
993 // Compute the most likely consensus
994 TPosPrDistIter itPosPrDist = begin(posPrDist, Standard());
995 TPosPrDistIter itPosPrDistEnd = end(posPrDist, Standard());
996 clear(gappedConsensus);
997 for(;itPosPrDist!=itPosPrDistEnd; ++itPosPrDist) {
998 TProbability max = 0;
999 TSize ind = 0;
1000 for(TSize consI = 0; consI<(alphabetSize + 1); ++consI) {
1001 if ((*itPosPrDist)[consI] > max) {
1002 max = (*itPosPrDist)[consI];
1003 ind = consI;
1004 }
1005 }
1006 if (ind == alphabetSize) appendValue(gappedConsensus, gapChar);
1007 else appendValue(gappedConsensus, TAlphabet(ind));
1008 }
1009
1010 }
1011
1012 //////////////////////////////////////////////////////////////////////////////
1013
1014
1015 template <typename TFragSpec, typename TConfig, typename TContigId>
1016 inline void
consensusCalling(FragmentStore<TFragSpec,TConfig> & fragStore,TContigId contigId,Bayesian)1017 consensusCalling(FragmentStore<TFragSpec, TConfig>& fragStore,
1018 TContigId contigId,
1019 Bayesian)
1020 {
1021 SEQAN_CHECKPOINT
1022
1023 typedef FragmentStore<TFragSpec, TConfig> TFragmentStore;
1024 typedef typename Size<TFragmentStore>::Type TSize;
1025 typedef typename TFragmentStore::TReadSeq TReadSeq;
1026 typedef typename Value<TReadSeq>::Type TAlphabet;
1027 typedef char TValue;
1028
1029 // Convert the contig to an alignment matrix
1030 typedef String<TValue> TAlignMat;
1031 TAlignMat mat;
1032 TSize maxCoverage;
1033 convertAlignment(fragStore, mat, contigId, maxCoverage);
1034
1035 // Call the consensus
1036 String<TValue> gappedConsensus;
1037 consensusCalling(mat, gappedConsensus, maxCoverage, TAlphabet(), Bayesian());
1038
1039 // Assign the new consensus
1040 assignGappedConsensus(fragStore, gappedConsensus, contigId);
1041 }
1042
1043 //////////////////////////////////////////////////////////////////////////////
1044
1045 template <typename TValue, typename TSpec, typename TGappedCons, typename TAlignDepth, typename TAlphabet>
1046 inline void
consensusCalling(String<TValue,TSpec> const & mat,TGappedCons & gappedConsensus,TAlignDepth maxCoverage,TAlphabet,MajorityVote)1047 consensusCalling(String<TValue, TSpec> const& mat,
1048 TGappedCons& gappedConsensus,
1049 TAlignDepth maxCoverage,
1050 TAlphabet,
1051 MajorityVote)
1052 {
1053 typedef typename Size<String<TValue, TSpec> >::Type TSize;
1054 TSize alphabetSize = ValueSize<TAlphabet>::VALUE;
1055 TValue gapChar = gapValue<TValue>();
1056
1057 // Set-up the counters
1058 typedef String<TSize> TCounter;
1059 typedef String<TCounter> TCounters;
1060 TCounters counterValues;
1061 _countLetters(mat, counterValues, maxCoverage, TAlphabet() );
1062
1063 // Get the consensus
1064 typedef typename Iterator<TCounters, Standard>::Type TCounterIt;
1065 TCounterIt countIt = begin(counterValues, Standard());
1066 TCounterIt countItEnd = end(counterValues, Standard());
1067 clear(gappedConsensus);
1068 TSize max = 0;
1069 TValue c = TValue();
1070 TSize pos = 0;
1071 for(;countIt != countItEnd; ++countIt) {
1072 max = 0;
1073 typedef typename Iterator<TCounter, Standard>::Type TCIt;
1074 TCIt cIt = begin(*countIt, Standard());
1075 TCIt cItEnd = end(*countIt, Standard());
1076 pos = 0;
1077 for(;cIt != cItEnd; ++cIt, ++pos) {
1078 if (*cIt > max) {
1079 max = *cIt;
1080 c = (pos == alphabetSize) ? gapChar : (TValue) TAlphabet(pos);
1081 }
1082 }
1083 appendValue(gappedConsensus, c);
1084 }
1085 }
1086
1087
1088 //////////////////////////////////////////////////////////////////////////////
1089
1090
1091 template <typename TFragSpec, typename TConfig, typename TContigId>
1092 inline void
consensusCalling(FragmentStore<TFragSpec,TConfig> & fragStore,TContigId contigId,MajorityVote)1093 consensusCalling(FragmentStore<TFragSpec, TConfig>& fragStore,
1094 TContigId contigId,
1095 MajorityVote)
1096 {
1097 typedef FragmentStore<TFragSpec, TConfig> TFragmentStore;
1098 typedef typename Size<TFragmentStore>::Type TSize;
1099 typedef typename TFragmentStore::TReadSeq TReadSeq;
1100 typedef typename Value<TReadSeq>::Type TAlphabet;
1101 typedef char TValue;
1102
1103 // Convert the contig to an alignment matrix
1104 typedef String<TValue> TAlignMat;
1105 TAlignMat mat;
1106 TSize maxCoverage;
1107 convertAlignment(fragStore, mat, contigId, maxCoverage);
1108
1109 // Call the consensus
1110 String<TValue> gappedConsensus;
1111 consensusCalling(mat, gappedConsensus, maxCoverage, TAlphabet(), MajorityVote());
1112
1113 // Assign the new consensus
1114 assignGappedConsensus(fragStore, gappedConsensus, contigId);
1115 }
1116
1117
1118 //////////////////////////////////////////////////////////////////////////////
1119
1120
1121 //////////////////////////////////////////////////////////////////////////////
1122 // Old proprietary FastaReadFormat
1123 //////////////////////////////////////////////////////////////////////////////
1124
1125 //////////////////////////////////////////////////////////////////////////////
1126
1127 struct FastaReadFormat_;
1128 typedef Tag<FastaReadFormat_> const FastaReadFormat;
1129
1130 //////////////////////////////////////////////////////////////////////////////
1131
1132 template<typename TFile, typename TSpec, typename TConfig>
1133 inline void
write(TFile & file,FragmentStore<TSpec,TConfig> & fragStore,FastaReadFormat)1134 write(
1135 TFile & file,
1136 FragmentStore<TSpec, TConfig>& fragStore,
1137 FastaReadFormat)
1138 {
1139 //IOREV _nodoc_ what has this got to do with consensus? why is it here?
1140 // Basic types
1141 typedef FragmentStore<TSpec, TConfig> TFragmentStore;
1142 typedef typename Size<TFragmentStore>::Type TSize;
1143 typedef typename TFragmentStore::TContigPos TContigPos;
1144 //typedef typename Value<TFile>::Type TValue;
1145 typedef char TMultiReadChar;
1146 TMultiReadChar gapChar = gapValue<TMultiReadChar>();
1147
1148 typename DirectionIterator<TFile, Output>::Type target = directionIterator(file, Output());
1149
1150 typedef typename Iterator<typename TFragmentStore::TContigStore, Standard>::Type TContigIter;
1151 TContigIter contigIt = begin(fragStore.contigStore, Standard() );
1152 TContigIter contigItEnd = end(fragStore.contigStore, Standard() );
1153 for(TSize idCount = 0;contigIt != contigItEnd; ++contigIt, ++idCount) {
1154 // Alignment matrix
1155 typedef String<TMultiReadChar> TAlignMat;
1156 TAlignMat mat;
1157 TSize maxCoverage;
1158 String<TSize> readSlot;
1159 convertAlignment(fragStore, mat, idCount, maxCoverage, readSlot);
1160 TSize len = length(mat) / maxCoverage;
1161
1162 // Gapped consensus sequence
1163 typedef String<TMultiReadChar> TGappedConsensus;
1164 TGappedConsensus gappedConsensus;
1165 getGappedConsensus(fragStore, gappedConsensus, idCount);
1166
1167 // Print the alignment matrix
1168 String<TSize> coverage;
1169 resize(coverage, len, 0);
1170 typedef typename Iterator<TGappedConsensus, Standard>::Type TConsIter;
1171 TConsIter itCons = begin(gappedConsensus, Standard());
1172 TSize winSize = 60;
1173 int offset = 2;
1174 TSize column = 0;
1175 while (column<len) {
1176 TSize window_end = column + winSize;
1177 if (window_end >= len) window_end = len;
1178 // Position
1179 for(int i = 0; i<offset - 2; ++i) writeValue(target, ' ');
1180 write(target, "Pos: ");
1181 appendNumber(target, column);
1182 writeValue(target, '\n');
1183 // Ruler
1184 for(int i = 0; i<offset + 3; ++i) writeValue(target, ' ');
1185 for(TSize local_col = 1; local_col<window_end - column + 1; ++local_col) {
1186 if ((local_col % 10)==0)
1187 writeValue(target, ':');
1188 else if ((local_col % 5)==0)
1189 writeValue(target, '.');
1190 else
1191 writeValue(target, ' ');
1192 }
1193 writeValue(target, '\n');
1194 // Matrix
1195 for(TSize row = 0; row<maxCoverage; ++row) {
1196 TSize tmp = row;
1197 int off = 0;
1198 while (tmp / 10 != 0) {
1199 tmp /= 10;
1200 ++off;
1201 }
1202 for(int i = 0; i<offset - off; ++i) writeValue(target, ' ');
1203 appendNumber(target, row);
1204 writeValue(target, ':');
1205 writeValue(target, ' ');
1206 for(TSize local_col = column; local_col<window_end; ++local_col) {
1207 writeValue(target, mat[row * len + local_col]);
1208 if (mat[row * len + local_col] != '.') ++coverage[local_col];
1209 }
1210 writeValue(target, '\n');
1211 }
1212 writeValue(target, '\n');
1213
1214 // Consensus
1215 for(int i = 0; i<offset; ++i)
1216 writeValue(target, ' ');
1217 write(target, "C: ");
1218 for(unsigned int local_col = column; local_col<window_end; ++local_col, ++itCons)
1219 writeValue(target, *itCons);
1220 writeValue(target, '\n');
1221 for(int i = 0; i<offset-1; ++i)
1222 writeValue(target, ' ');
1223 write(target, ">2: ");
1224 for(unsigned int local_col = column; local_col<window_end; ++local_col) {
1225 if (coverage[local_col] > 2)
1226 writeValue(target, gappedConsensus[local_col]);
1227 else
1228 writeValue(target, gapChar);
1229 }
1230 writeValue(target, '\n');
1231 writeValue(target, '\n');
1232 column+=winSize;
1233 }
1234 writeValue(target, '\n');
1235 writeValue(target, '\n');
1236
1237 // Print all aligned reads belonging to this contig
1238
1239 // Sort according to contigId
1240 sortAlignedReads(fragStore.alignedReadStore, SortContigId());
1241
1242 // Find range of the given contig
1243 typedef typename Iterator<typename TFragmentStore::TAlignedReadStore, Standard>::Type TAlignIter;
1244 TAlignIter alignIt = lowerBoundAlignedReads(fragStore.alignedReadStore, idCount, SortContigId());
1245 TAlignIter alignItEnd = upperBoundAlignedReads(fragStore.alignedReadStore, idCount, SortContigId());
1246
1247 // Sort the reads according to the begin position
1248 sortAlignedReads(infix(fragStore.alignedReadStore, alignIt - begin(fragStore.alignedReadStore, Standard()), alignItEnd - begin(fragStore.alignedReadStore, Standard())), SortBeginPos());
1249 TAlignIter alignItTmp = lowerBoundAlignedReads(fragStore.alignedReadStore, idCount, SortContigId());
1250 TAlignIter alignItTmpEnd = upperBoundAlignedReads(fragStore.alignedReadStore, idCount, SortContigId());
1251 String<std::pair<TSize, TSize> > idToPos;
1252 reserve(idToPos, alignItTmpEnd - alignItTmp);
1253 for(TSize iCount = 0; alignItTmp!=alignItTmpEnd; ++iCount, ++alignItTmp)
1254 appendValue(idToPos, std::make_pair(alignItTmp->id, readSlot[iCount]));
1255 std::sort(begin(idToPos, Standard()), end(idToPos, Standard()));
1256
1257 // Sort the reads according to the id
1258 sortAlignedReads(infix(fragStore.alignedReadStore, alignIt - begin(fragStore.alignedReadStore, Standard()), alignItEnd - begin(fragStore.alignedReadStore, Standard())), SortId());
1259 alignIt = lowerBoundAlignedReads(fragStore.alignedReadStore, idCount, SortContigId());
1260 alignItEnd = upperBoundAlignedReads(fragStore.alignedReadStore, idCount, SortContigId());
1261
1262 bool noNamesPresent = (length(fragStore.readNameStore) == 0);
1263 for(TSize iCount = 0;alignIt != alignItEnd; ++alignIt, ++iCount) {
1264
1265 // Print all reads
1266 write(target, "typ:");
1267 if (!noNamesPresent)
1268 {
1269 writeValue(target, 'R');
1270 appendNumber(target, iCount);
1271 }
1272 else
1273 {
1274 write(target, fragStore.readNameStore[alignIt->readId]);
1275 }
1276 write(target, "\nseq:");
1277 write(target, fragStore.readSeqStore[alignIt->readId]);
1278 write(target, "\nPos:");
1279 appendNumber(target, alignIt->beginPos);
1280 writeValue(target, ',');
1281 appendNumber(target, alignIt->endPos);
1282 writeValue(target, '\n');
1283 #ifndef CELERA_OFFSET
1284 TSize begClr = 0;
1285 TSize endClr = 0;
1286 getClrRange(fragStore, *alignIt, begClr, endClr);
1287 write(target, "clr:");
1288 appendNumber(target, begClr);
1289 writeValue(target, ',');
1290 appendNumber(target, endClr);
1291 writeValue(target, '\n');
1292 #endif
1293 std::stringstream gapCoords;
1294 TSize letterCount = 0;
1295 TSize gapCount = 0;
1296 for(TContigPos column = _min(alignIt->beginPos, alignIt->endPos); column < _max(alignIt->beginPos, alignIt->endPos); ++column) {
1297 if (mat[idToPos[iCount].second * len + column] == gapChar) {
1298 ++gapCount;
1299 gapCoords << letterCount << ' ';
1300 } else ++letterCount;
1301 }
1302 write(target, "dln:");
1303 appendNumber(target, gapCount);
1304 writeValue(target, '\n');
1305 write(target, "del:");
1306 write(target, gapCoords.str());
1307 writeValue(target, '\n');
1308 writeValue(target, '\n');
1309 }
1310 }
1311 }
1312
1313 //////////////////////////////////////////////////////////////////////////////
1314 // Read simulator format: Simple fasta read file with positions
1315 //////////////////////////////////////////////////////////////////////////////
1316
1317
1318 //////////////////////////////////////////////////////////////////////////////////
1319
1320
1321 template<typename TFile, typename TSpec, typename TConfig, typename TFilePath>
1322 inline void
_convertSimpleReadFile(TFile &,FragmentStore<TSpec,TConfig> &,TFilePath &,bool)1323 _convertSimpleReadFile(TFile& /*file*/,
1324 FragmentStore<TSpec, TConfig>& /*fragStore*/,
1325 TFilePath& /*filePath*/,
1326 bool /*moveToFront*/)
1327 {
1328 return;
1329 ////IOREV _nodoc_ huge undocumented function, uses custom IO based on iostream and FILE* :S
1330 // // Basic types
1331 // typedef FragmentStore<TSpec, TConfig> TFragmentStore;
1332 // typedef typename Id<TFragmentStore>::Type TId;
1333 // typedef typename Size<TFragmentStore>::Type TSize;
1334 // //typedef typename Value<TFile>::Type TValue;
1335 // typedef typename TFragmentStore::TContigPos TPos;
1336 // typedef typename TFragmentStore::TReadSeq TReadSeq;
1337 //
1338 // // All fragment store element types
1339 // typedef typename Value<typename TFragmentStore::TContigStore>::Type TContigElement;
1340 // typedef typename Value<typename TFragmentStore::TLibraryStore>::Type TLibraryStoreElement;
1341 // typedef typename Value<typename TFragmentStore::TMatePairStore>::Type TMatePairElement;
1342 // typedef typename Value<typename TFragmentStore::TReadStore>::Type TReadStoreElement;
1343 // typedef typename Value<typename TFragmentStore::TAlignedReadStore>::Type TAlignedElement;
1344 //
1345 //
1346 // // All maps to mirror file ids to our internal ids
1347 // typedef std::map<TId, TId> TIdMap;
1348 // TIdMap libIdMap;
1349 // TIdMap frgIdMap;
1350 // TIdMap readIdMap;
1351 //
1352 // // Create RecordReader object.
1353 // typename DirectionIterator<TFile, Input>::Type reader = directionIterator(file, Input());
1354 //
1355 // // Parse the file and convert the internal ids
1356 // TPos maxPos = 0;
1357 // TPos minPos = MaxValue<TPos>::VALUE;
1358 // TId count = 0;
1359 // if (atEnd(reader))
1360 // return false;
1361 // CharString buffer;
1362 // while (!atEnd(reader))
1363 // {
1364 // // New read?
1365 // if (value(reader) == '>')
1366 // {
1367 // TAlignedElement alignEl;
1368 // TId id = count;
1369 // TId fragId = TReadStoreElement::INVALID_ID;
1370 // TId repeatId = 0;
1371 //
1372 // goNext(reader);
1373 // if (skipWhitespaces(reader) != 0)
1374 // return 1;
1375 //
1376 // // Get the layout positions
1377 // clear(buffer);
1378 // if (readDigits(buffer, reader) != 0)
1379 // return 1;
1380 // if (!lexicalCast2(alignEl.beginPos, buffer))
1381 // return 1;
1382 // goNext(reader);
1383 // if (skipWhitespaces(reader) != 0)
1384 // return 1;
1385 // clear(buffer);
1386 // if (readDigits(buffer, reader) != 0)
1387 // return 1;
1388 // if (!lexicalCast2(alignEl.endPos, buffer))
1389 // return 1;
1390 //
1391 // // Any attributes?
1392 // String<char> eid;
1393 // String<char> qlt;
1394 // TReadSeq seq;
1395 // if (value(reader) == '[')
1396 // {
1397 // String<char> fdIdentifier;
1398 // while (value(reader) != ']')
1399 // {
1400 // goNext(reader);
1401 // if (skipWhitespaces(reader) != 0)
1402 // return 1;
1403 // clear(fdIdentifier);
1404 // if (readAlphaNums(fdIdentifier, reader) != 0)
1405 // return 1;
1406 // goNext(reader); // Skip "="
1407 // if (fdIdentifier == "id")
1408 // {
1409 // clear(buffer);
1410 // if (readDigits(buffer, reader) != 0)
1411 // return 1;
1412 // if (!lexicalCast2(id, buffer))
1413 // return 1;
1414 // } else if (fdIdentifier == "fragId") {
1415 // clear(buffer);
1416 // if (readDigits(buffer, reader) != 0)
1417 // return 1;
1418 // if (!lexicalCast2(fragId, buffer))
1419 // return 1;
1420 // } else if (fdIdentifier == "repeatId") {
1421 // clear(buffer);
1422 // if (readDigits(buffer, reader) != 0)
1423 // return 1;
1424 // if (!lexicalCast2(repeatId, buffer))
1425 // return 1;
1426 // } else if (fdIdentifier == "eid") {
1427 // if (readUntilOneOf(eid, reader, ',', ']') != 0)
1428 // return 1;
1429 // } else if (fdIdentifier == "qlt") {
1430 // if (readUntilOneOf(qlt, reader, ',', ']') != 0)
1431 // return 1;
1432 // } else {
1433 // // Jump to next attribute
1434 // // TODO(holtgrew): Add skipUntilOneOf()?
1435 // if (readUntilOneOf(buffer, reader, ',', ']') != 0)
1436 // return 1;
1437 // }
1438 // }
1439 // }
1440 // if (skipLine(reader) != 0)
1441 // return 1;
1442 // if (skipWhitespaces(reader) != 0)
1443 // return 1;
1444 // while (!atEnd(reader) && value(reader) != '>')
1445 // {
1446 // if (readLetters(seq, reader) != 0)
1447 // return 1;
1448 // int res = skipWhitespaces(reader);
1449 // if (res != 0 && res != EOF_BEFORE_SUCCESS)
1450 // return 1;
1451 // }
1452 //
1453 // // Set quality
1454 // typedef typename Iterator<TReadSeq, Standard>::Type TReadIter;
1455 // typedef typename Iterator<String<char> >::Type TQualIter;
1456 // TReadIter begIt = begin(seq, Standard() );
1457 // TReadIter begItEnd = begin(seq, Standard() );
1458 // if (length(qlt)) {
1459 // TQualIter qualIt = begin(qlt);
1460 // TQualIter qualItEnd = end(qlt);
1461 // for(;qualIt != qualItEnd; goNext(qualIt), goNext(begIt)) assignQualityValue(value(begIt), value(qualIt));
1462 // } else {
1463 // for(;begIt != begItEnd; goNext(begIt)) assignQualityValue(value(begIt), 'D');
1464 // }
1465 //
1466 // // Set eid if not given
1467 // if (empty(eid)) {
1468 // std::stringstream input;
1469 // input << "R" << id;
1470 // input << "-" << repeatId;
1471 // eid = input.str().c_str();
1472 // }
1473 //
1474 // // Insert the read
1475 // readIdMap.insert(std::make_pair(id, static_cast<TId>(length(fragStore.readStore))));
1476 // appendRead(fragStore, seq, fragId);
1477 // appendValue(fragStore.readNameStore, eid, Generous());
1478 //
1479 // // Insert an aligned read
1480 // TSize readLen = length(seq);
1481 // if (alignEl.beginPos < alignEl.endPos) {
1482 // if ((TPos)readLen != alignEl.endPos - alignEl.beginPos) {
1483 // alignEl.endPos = alignEl.beginPos + readLen;
1484 // }
1485 // if (alignEl.beginPos < minPos) minPos = alignEl.beginPos;
1486 // if (alignEl.endPos > maxPos) maxPos = alignEl.endPos;
1487 // } else {
1488 // if ((TPos)readLen != alignEl.beginPos - alignEl.endPos) {
1489 // alignEl.beginPos = alignEl.endPos + readLen;
1490 // }
1491 // if (alignEl.endPos < minPos) minPos = alignEl.endPos;
1492 // if (alignEl.beginPos > maxPos) maxPos = alignEl.beginPos;
1493 // }
1494 // alignEl.readId = id;
1495 // alignEl.pairMatchId = fragId;
1496 // alignEl.contigId = 0;
1497 // alignEl.id = length(fragStore.alignedReadStore);
1498 // appendValue(fragStore.alignedReadStore, alignEl, Generous());
1499 // ++count;
1500 // } else {
1501 // if (skipLine(reader) != 0)
1502 // return 1;
1503 // }
1504 // }
1505 //
1506 // // Read contig or reference sequence
1507 // TContigElement contigEl;
1508 // std::string fileName = filePath + 'S';
1509 // SeqFileIn strmRef;
1510 // String<char> contigEid = "C0";
1511 // if (open(strmRef, fileName.c_str()))
1512 // {
1513 // typename DirectionIterator<TFile, Input>::Type readerRef = directionIterator(strmRef, Input());
1514 // clear(contigEid);
1515 // readRecord(contigEid, contigEl.seq, strmRef);
1516 // close(strmRef);
1517 // }
1518 // if (empty(contigEl.seq)) {
1519 // typedef typename TFragmentStore::TContigGapAnchor TContigGapAnchor;
1520 // if (moveToFront) appendValue(contigEl.gaps, TContigGapAnchor(0, maxPos - minPos), Generous());
1521 // else appendValue(contigEl.gaps, TContigGapAnchor(0, maxPos), Generous());
1522 // }
1523 // appendValue(fragStore.contigStore, contigEl, Generous());
1524 // appendValue(fragStore.contigNameStore, contigEid, Generous());
1525 //
1526 //
1527 // // Read fragments
1528 // fileName = filePath + 'F';
1529 // FILE* strmFrag = fopen(fileName.c_str(), "rb");
1530 // if (!strmFrag)
1531 // return 1;
1532 // RecordReader<FILE *, SinglePass<> > readerFrag(strmFrag);
1533 // while (!atEnd(readerFrag))
1534 // {
1535 // if (value(readerFrag) == '>')
1536 // {
1537 // TMatePairElement matePairEl;
1538 // goNext(readerFrag);
1539 // if (skipWhitespaces(readerFrag) != 0)
1540 // return 1;
1541 //
1542 // // Get the fragment id
1543 // clear(buffer);
1544 // if (readDigits(buffer, readerFrag) != 0)
1545 // return 1;
1546 // TId id = 0;
1547 // if (!lexicalCast2(id, buffer))
1548 // return 1;
1549 //
1550 // // Any attributes?
1551 // std::stringstream input;
1552 // input << "F" << id;
1553 // String<char> eid(input.str().c_str());
1554 // if (value(readerFrag) == '[')
1555 // {
1556 // String<char> fdIdentifier;
1557 // while (value(readerFrag) != ']')
1558 // {
1559 // goNext(readerFrag);
1560 // if (skipWhitespaces(readerFrag) != 0)
1561 // return 1;
1562 // clear(fdIdentifier);
1563 // if (readAlphaNums(fdIdentifier, readerFrag) != 0)
1564 // return 1;
1565 // goNext(readerFrag);
1566 // if (fdIdentifier == "libId")
1567 // {
1568 // clear(buffer);
1569 // if (readDigits(buffer, readerFrag) != 0)
1570 // return 1;
1571 // if (!lexicalCast2(matePairEl.libId, buffer))
1572 // return 1;
1573 // } else if (fdIdentifier == "eid") {
1574 // clear(eid);
1575 // if (readUntilOneOf(eid, readerFrag, ',', ']') != 0)
1576 // return 1;
1577 // } else {
1578 // // Jump to next attribute
1579 // if (readUntilOneOf(buffer, readerFrag, ',', ']') != 0)
1580 // return 1;
1581 // }
1582 // }
1583 // }
1584 // if (skipLine(readerFrag) != 0)
1585 // return 1;
1586 //
1587 // // Read the two reads belonging to this mate pair
1588 // clear(buffer);
1589 // for (int i = 0; i < 2; ++i)
1590 // {
1591 // if (skipWhitespaces(readerFrag) != 0)
1592 // return 1;
1593 // clear(buffer);
1594 // if (readDigits(buffer, readerFrag) != 0)
1595 // return 1;
1596 // if (!lexicalCast2(matePairEl.readId[i], buffer))
1597 // return 1;
1598 // if (!i) // Skip ','.
1599 // goNext(readerFrag);
1600 // }
1601 // int res = skipLine(readerFrag);
1602 // if (res != 0 && res != EOF_BEFORE_SUCCESS)
1603 // return 1;
1604 //
1605 // // Insert mate pair
1606 // if (matePairEl.readId[0] != matePairEl.readId[1]) {
1607 // frgIdMap.insert(std::make_pair(id, static_cast<TId>(length(fragStore.matePairStore))));
1608 // appendValue(fragStore.matePairStore, matePairEl, Generous());
1609 // appendValue(fragStore.matePairNameStore, eid, Generous());
1610 // }
1611 // } else {
1612 // int res = skipLine(readerFrag);
1613 // if (res != 0 && res != EOF_BEFORE_SUCCESS)
1614 // return 1;
1615 // }
1616 // }
1617 // fclose(strmFrag);
1618 //
1619 // // Read libraries
1620 // fileName = filePath + 'L';
1621 // FILE* strmLib = fopen(fileName.c_str(), "rb");
1622 // if (!strmLib)
1623 // return 1;
1624 // RecordReader<FILE *, SinglePass<> > readerLib(strmLib);
1625 // while (!atEnd(readerLib))
1626 // {
1627 // if (value(readerLib) == '>')
1628 // {
1629 // TLibraryStoreElement libEl;
1630 // goNext(readerLib);
1631 // if (skipWhitespaces(readerLib) != 0)
1632 // return 1;
1633 //
1634 // // Get the fragment id
1635 // clear(buffer);
1636 // if (readDigits(buffer, readerLib) != 0)
1637 // return 1;
1638 // TId id = 0;
1639 // if (!lexicalCast2(id, buffer))
1640 // return 1;
1641 //
1642 // // Any attributes?
1643 // std::stringstream input;
1644 // input << "L" << id;
1645 // String<char> eid(input.str().c_str());
1646 // if (value(readerLib) == '[')
1647 // {
1648 // String<char> fdIdentifier;
1649 // while (value(readerLib) != ']')
1650 // {
1651 // goNext(readerLib);
1652 // if (skipWhitespaces(readerLib) != 0)
1653 // return 1;
1654 // clear(fdIdentifier);
1655 // if (readAlphaNums(fdIdentifier, readerLib) != 0)
1656 // return 1;
1657 // if (fdIdentifier == "eid")
1658 // {
1659 // clear(eid);
1660 // if (readUntilOneOf(eid, readerLib, ',', ']') != 0)
1661 // return 1;
1662 // } else {
1663 // // Jump to next attribute
1664 // // TODO(holtgrew): skipUntilOneOf()?
1665 // if (readUntilOneOf(buffer, readerLib, ',', ']') != 0)
1666 // return 1;
1667 // }
1668 // }
1669 // }
1670 // if (skipLine(readerLib) != 0)
1671 // return 1;
1672 // if (skipWhitespaces(readerLib) != 0)
1673 // return 1;
1674 //
1675 // // Read the mean and standard deviation
1676 // clear(buffer);
1677 // if (readDigits(buffer, readerLib) != 0)
1678 // return 1;
1679 // if (!lexicalCast2(libEl.mean, buffer))
1680 // return 1;
1681 // if (skipWhitespaces(readerLib) != 0)
1682 // return 1;
1683 // goNext(readerLib);
1684 // clear(buffer);
1685 // if (readDigits(buffer, readerLib) != 0)
1686 // return 1;
1687 // if (!lexicalCast2(libEl.std, buffer))
1688 // return 1;
1689 // int res = skipLine(readerLib);
1690 // if (res != 0 && res != EOF_BEFORE_SUCCESS)
1691 // return 1;
1692 //
1693 // // Insert mate pair
1694 // libIdMap.insert(std::make_pair(id, static_cast<TId>(length(fragStore.libraryStore))));
1695 // appendValue(fragStore.libraryStore, libEl, Generous());
1696 // appendValue(fragStore.libraryNameStore, eid, Generous());
1697 // } else {
1698 // int res = skipLine(readerLib);
1699 // if (res != 0 && res != EOF_BEFORE_SUCCESS)
1700 // return 1;
1701 // }
1702 // }
1703 // fclose(strmLib);
1704 //
1705 // // Renumber all ids
1706 // typedef typename TIdMap::const_iterator TIdMapIter;
1707 // typedef typename Iterator<typename TFragmentStore::TMatePairStore>::Type TMateIter;
1708 // TMateIter mateIt = begin(fragStore.matePairStore);
1709 // TMateIter mateItEnd = end(fragStore.matePairStore);
1710 // for(;mateIt != mateItEnd; goNext(mateIt)) {
1711 // if (mateIt->libId != TMatePairElement::INVALID_ID) {
1712 // TIdMapIter libIdPos = libIdMap.find(mateIt->libId);
1713 // if (libIdPos != libIdMap.end()) mateIt->libId = libIdPos->second;
1714 // else mateIt->libId = TMatePairElement::INVALID_ID;
1715 // }
1716 // if (mateIt->readId[0] != TMatePairElement::INVALID_ID) {
1717 // TIdMapIter readIdPos = readIdMap.find(mateIt->readId[0]);
1718 // if (readIdPos != readIdMap.end()) mateIt->readId[0] = readIdPos->second;
1719 // else mateIt->readId[0] = TMatePairElement::INVALID_ID;
1720 // }
1721 // if (mateIt->readId[1]!= TMatePairElement::INVALID_ID) {
1722 // TIdMapIter readIdPos = readIdMap.find(mateIt->readId[1]);
1723 // if (readIdPos != readIdMap.end()) mateIt->readId[1] = readIdPos->second;
1724 // else mateIt->readId[0] = TMatePairElement::INVALID_ID;
1725 // }
1726 // }
1727 // typedef typename Iterator<typename TFragmentStore::TReadStore>::Type TReadIter;
1728 // TReadIter readIt = begin(fragStore.readStore);
1729 // TReadIter readItEnd = end(fragStore.readStore);
1730 // for(;readIt != readItEnd; goNext(readIt)) {
1731 // if (readIt->matePairId != TReadStoreElement::INVALID_ID) {
1732 // TIdMapIter mateIdPos = frgIdMap.find(readIt->matePairId);
1733 // if (mateIdPos != frgIdMap.end()) readIt->matePairId = mateIdPos->second;
1734 // else readIt->matePairId = TReadStoreElement::INVALID_ID;
1735 // }
1736 // }
1737 // typedef typename Iterator<typename TFragmentStore::TAlignedReadStore>::Type TAlignIter;
1738 // TAlignIter alignIt = begin(fragStore.alignedReadStore);
1739 // TAlignIter alignItEnd = end(fragStore.alignedReadStore);
1740 // for(;alignIt != alignItEnd; goNext(alignIt)) {
1741 // if (alignIt->readId != TAlignedElement::INVALID_ID) {
1742 // TIdMapIter readIdPos = readIdMap.find(alignIt->readId);
1743 // if (readIdPos != readIdMap.end()) alignIt->readId = readIdPos->second;
1744 // else alignIt->readId = TAlignedElement::INVALID_ID;
1745 // }
1746 // if (moveToFront) {
1747 // alignIt->beginPos -= minPos;
1748 // alignIt->endPos -= minPos;
1749 // }
1750 // }
1751 // return 0;
1752 }
1753
1754
1755 //////////////////////////////////////////////////////////////////////////////
1756 // Rudimentary write functions for CeleraFrg and Celera Cgb
1757 //////////////////////////////////////////////////////////////////////////////
1758
1759 template<typename TFile, typename TSpec, typename TConfig>
1760 inline void
_writeCeleraFrg(TFile & file,FragmentStore<TSpec,TConfig> & fragStore)1761 _writeCeleraFrg(TFile& file,
1762 FragmentStore<TSpec, TConfig>& fragStore)
1763 {
1764 //IOREV _nodoc_
1765 typedef FragmentStore<TSpec, TConfig> TFragmentStore;
1766 typedef typename Size<TFragmentStore>::Type TSize;
1767 typedef typename TFragmentStore::TReadPos TReadPos;
1768
1769 typename DirectionIterator<TFile, Output>::Type target = directionIterator(file, Output());
1770
1771 // Iterate over all aligned reads to get the clear ranges
1772 typedef Pair<TReadPos, TReadPos> TClrRange;
1773 String<TClrRange> clearStr;
1774 resize(clearStr, length(fragStore.readStore));
1775 typedef typename Iterator<typename TFragmentStore::TAlignedReadStore>::Type TAlignIter;
1776 TAlignIter alignIt = begin(fragStore.alignedReadStore);
1777 TAlignIter alignItEnd = end(fragStore.alignedReadStore);
1778 for(;alignIt != alignItEnd; goNext(alignIt))
1779 {
1780 TReadPos begClr = 0;
1781 TReadPos endClr = 0;
1782 getClrRange(fragStore, value(alignIt), begClr, endClr);
1783 value(clearStr, alignIt->readId) = TClrRange(begClr, endClr);
1784 }
1785
1786 // Write Reads
1787 typedef typename Iterator<typename TFragmentStore::TReadStore>::Type TReadIter;
1788 TReadIter readIt = begin(fragStore.readStore);
1789 TReadIter readItEnd = end(fragStore.readStore);
1790 bool noNamesPresent = (length(fragStore.readNameStore) == 0);
1791 for (TSize idCount = 0; readIt != readItEnd; goNext(readIt), ++idCount)
1792 {
1793 write(target, "{FRG\nact:A\nacc:");
1794 appendNumber(target, idCount + 1);
1795 write(target, "\ntyp:R\n");
1796 if (!noNamesPresent)
1797 {
1798 write(target, "src:\n");
1799 write(target, fragStore.readNameStore[idCount]);
1800 write(target, "\n.\n");
1801 }
1802 write(target, "etm:0\nseq:\n");
1803 writeWrappedString(target, fragStore.readSeqStore[idCount], 70);
1804 write(target, ".\nqlt:\n");
1805 typedef typename Value<typename TFragmentStore::TReadSeqStore>::Type TReadSeq;
1806 typedef QualityExtractor<typename Value<TReadSeq>::Type> TQualityExtractor;
1807 ModifiedString<TReadSeq const, ModView<TQualityExtractor> > quals(fragStore.readSeqStore[idCount]);
1808 writeWrappedString(target, quals, 70);
1809 // Note: Clear range does not have to be ordered, e.g. no indication for reverse complemented reads, this is happening in cgb records
1810 write(target, ".\nclr:");
1811 appendNumber(target, clearStr[idCount].i1);
1812 writeValue(target, ',');
1813 appendNumber(target, clearStr[idCount].i2);
1814 write(target, "\n}\n");
1815 }
1816 }
1817
1818
1819 //////////////////////////////////////////////////////////////////////////////
1820
1821 // Writes out the first contig only.
1822
1823 // TODO(holtgrew): This is only used in seqcons, should it go into app?
1824 template<typename TFile, typename TSpec, typename TConfig>
1825 inline void
_writeCeleraCgb(TFile & file,FragmentStore<TSpec,TConfig> & fragStore)1826 _writeCeleraCgb(TFile& file,
1827 FragmentStore<TSpec, TConfig>& fragStore)
1828 {
1829 typedef FragmentStore<TSpec, TConfig> TFragmentStore;
1830 typedef typename Size<TFragmentStore>::Type TSize;
1831 typedef typename Id<TFragmentStore>::Type TId;
1832 //typedef typename TFragmentStore::TReadPos TReadPos;
1833
1834 typename DirectionIterator<TFile, Output>::Type target = directionIterator(file, Output());
1835
1836 // Write the first contig
1837 TId contigId = 0;
1838
1839 // Sort the reads according to position
1840 sortAlignedReads(fragStore.alignedReadStore, SortBeginPos());
1841
1842 // Write Header
1843 write(target, "{IUM\nacc:0\nsrc:\ngen> @@ [0,0]\n.\ncov:0.000\nsta:X\nfur:X\nabp:0\nbbp:0\nlen:");
1844 appendNumber(target, length((value(fragStore.contigStore, contigId)).seq));
1845 write(target, "\ncns:\n.\nqlt:\n.\nfor:0\nnfr:");
1846 appendNumber(target, length(fragStore.readStore));
1847 writeValue(target, '\n');
1848
1849 // Write reads
1850 typedef typename Iterator<typename TFragmentStore::TAlignedReadStore>::Type TAlignIter;
1851 TAlignIter alignIt = begin(fragStore.alignedReadStore);
1852 TAlignIter alignItEnd = end(fragStore.alignedReadStore);
1853 TSize offsetLeft = _min(alignIt->beginPos, alignIt->endPos);
1854 for (;alignIt != alignItEnd; goNext(alignIt))
1855 {
1856 if (contigId != alignIt->contigId)
1857 continue;
1858 write(target, "{IMP\ntyp:R\nmid:");
1859 appendNumber(target, alignIt->readId + 1);
1860 write(target, "\ncon:0\npos:");
1861 appendNumber(target, alignIt->beginPos - offsetLeft);
1862 writeValue(target, ',');
1863 appendNumber(target, alignIt->endPos - offsetLeft);
1864 write(target, "\ndln:0\ndel:\n}\n");
1865 }
1866 write(target, "}\n");
1867 }
1868
1869 } // namespace seqan
1870
1871 #endif //#ifndef SEQAN_HEADER_...
1872