1 // ==========================================================================
2 // SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 // * Redistributions of source code must retain the above copyright
11 // notice, this list of conditions and the following disclaimer.
12 // * Redistributions in binary form must reproduce the above copyright
13 // notice, this list of conditions and the following disclaimer in the
14 // documentation and/or other materials provided with the distribution.
15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 // its contributors may be used to endorse or promote products derived
17 // from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: Tobias Rausch <rausch@embl.de>
33 // ==========================================================================
34
35 #ifndef SEQAN_HEADER_CONSENSUS_BASE_H
36 #define SEQAN_HEADER_CONSENSUS_BASE_H
37
38 namespace seqan
39 {
40
41
42 //////////////////////////////////////////////////////////////////////////////
43 // Segment Match Generation tag
44 //////////////////////////////////////////////////////////////////////////////
45
46 // TODO(holtgrew): This apparently belongs into graph_msa?
47
48 struct OverlapLibrary_;
49 typedef Tag<OverlapLibrary_> const OverlapLibrary;
50
51
52
53 //////////////////////////////////////////////////////////////////////////////
54 // Consensus tag
55 //////////////////////////////////////////////////////////////////////////////
56
57 /*
58 * @defgroup ConsensusCallingTags Consensus Calling Tags
59 * @brief Tags for consensus calling.
60 *
61 * @tag ConsensusCallingTags#MajorityVote
62 * @headerfile <seqan/consensus.h>
63 * @brief Consensus based on the most common character.
64 *
65 * @tag ConsensusCallingTags#Bayesian
66 * @headerfile <seqan/consensus.h>
67 * @brief Consensus based on a bayesian probability.
68 */
69
70
71 struct MajorityVote_;
72 typedef Tag<MajorityVote_> const MajorityVote;
73
74 struct Bayesian_;
75 typedef Tag<Bayesian_> const Bayesian;
76
77
78
79 //////////////////////////////////////////////////////////////////////////////
80 // Read alignment and Consensus Generation
81 //////////////////////////////////////////////////////////////////////////////
82
83
84 //////////////////////////////////////////////////////////////////////////////////
85
86 struct ConsensusOptions {
87 public:
88 // Method
89 // 0: graph-based multiple sequence alignment
90 // 1: realign
91 int method;
92
93 // ReAlign Method
94 // 0: Needleman-Wunsch
95 // 1: Gotoh
96 int rmethod;
97
98 // Bandwidth of overlap alignment
99 int bandwidth;
100
101 // Number of computed overlaps per read (at the beginning and end of a read)
102 int overlaps;
103
104 // Minimum match length of a computed overlap
105 int matchlength;
106
107 // Minimum quality (in percent identity) of a computed overlap
108 int quality;
109
110 // Window size, only relevant for insert sequencing
111 // If window == 0, no insert sequencing is assumed
112 int window;
113
114 // Output
115 // 0: seqan style
116 // 1: afg output format
117 // 2: frg output format
118 // 3: cgb output format
119 // 4: Sam output format
120 int output;
121
122 // Multi-read alignment
123 bool noalign;
124
125 // Offset all reads, so the first read starts at position 0
126 bool moveToFront;
127
128 // Include reference genome
129 bool include;
130
131 // Scoring object for overlap alignments
132 Score<int> sc;
133
134 // Various input and output files
135 std::string readsfile; // File of reads in FASTA format
136 std::string afgfile; // AMOS afg file input
137 std::string samfile; // Sam file input
138 std::string contigsfile; // FASTA reference file for Sam input
139 std::string outfile; // Output file name
140
141 // Initialization
ConsensusOptionsConsensusOptions142 ConsensusOptions() :
143 method(0), rmethod(0), bandwidth(0), overlaps(0), matchlength(0), quality(0),
144 window(0), output(0), noalign(false), moveToFront(0), include(0), sc(2, -6, -4, -9)
145 {}
146 };
147
148
149 //////////////////////////////////////////////////////////////////////////////////
150
151 // Copy reads for the whole contig out of fragStore and into strSet. The start and end
152 // positions of the alignments go into startEndPos.
153
154 template<typename TValue, typename TStrSpec, typename TPosPair, typename TStringSpec, typename TSpec, typename TConfig, typename TId>
155 inline void
_loadContigReads(StringSet<TValue,Owner<TStrSpec>> & strSet,String<TPosPair,TStringSpec> & startEndPos,FragmentStore<TSpec,TConfig> const & fragStore,TId const contigId)156 _loadContigReads(StringSet<TValue, Owner<TStrSpec> > & strSet,
157 String<TPosPair, TStringSpec> & startEndPos,
158 FragmentStore<TSpec, TConfig> const & fragStore,
159 TId const contigId)
160 {
161 typedef FragmentStore<TSpec, TConfig> const TFragmentStore;
162 typedef typename Size<TFragmentStore>::Type TSize;
163 typedef typename TFragmentStore::TReadPos TReadPos;
164
165
166 // Sort aligned reads according to contig id
167 sortAlignedReads(fragStore.alignedReadStore, SortContigId());
168 resize(strSet, length(fragStore.alignedReadStore));
169
170 // Retrieve all reads, limit them to the clear range and if required reverse complement them
171 typedef typename Iterator<typename TFragmentStore::TAlignedReadStore const>::Type TAlignIter;
172 TAlignIter alignIt = lowerBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId());
173 TAlignIter alignItEnd = upperBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId());
174 TSize numRead = 0;
175 TReadPos begClr = 0;
176 TReadPos endClr = 0;
177 TSize lenRead = 0;
178 TSize offset = 0;
179 for(;alignIt != alignItEnd; ++alignIt) {
180 offset = _min(alignIt->beginPos, alignIt->endPos);
181 getClrRange(fragStore, *alignIt, begClr, endClr);
182 strSet[numRead] = infix(fragStore.readSeqStore[alignIt->readId], begClr, endClr);
183 lenRead = endClr - begClr;
184 if (alignIt->beginPos < alignIt->endPos) appendValue(startEndPos, TPosPair(offset, offset + lenRead), Generous());
185 else {
186 reverseComplement(strSet[numRead]);
187 appendValue(startEndPos, TPosPair(offset + lenRead, offset), Generous());
188 }
189 ++numRead;
190 }
191 resize(strSet, numRead, Exact());
192 }
193
194
195
196
197 //////////////////////////////////////////////////////////////////////////////
198
199 template<typename TSpec, typename TConfig, typename TMatrix, typename TSize2, typename TSize, typename TReadSlot>
200 inline bool
convertAlignment(FragmentStore<TSpec,TConfig> & fragStore,TMatrix & mat,TSize2 contigId,TSize & coverage,TReadSlot & slot)201 convertAlignment(FragmentStore<TSpec, TConfig>& fragStore,
202 TMatrix& mat,
203 TSize2 contigId,
204 TSize& coverage,
205 TReadSlot& slot)
206 {
207 typedef FragmentStore<TSpec, TConfig> TFragmentStore;
208 typedef typename Value<TMatrix>::Type TValue;
209 typedef typename TFragmentStore::TContigPos TContigPos;
210
211 // Gap char
212 TValue gapChar = gapValue<TValue>();
213
214 // Sort according to contigId
215 sortAlignedReads(fragStore.alignedReadStore, SortContigId());
216
217 // Find range of the given contig
218 typedef typename Iterator<typename TFragmentStore::TAlignedReadStore, Standard>::Type TAlignIter;
219 TAlignIter alignIt = lowerBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId());
220 TAlignIter alignItEnd = upperBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId());
221
222 // Sort the reads according to the begin position
223 sortAlignedReads(infix(fragStore.alignedReadStore, alignIt - begin(fragStore.alignedReadStore, Standard()), alignItEnd - begin(fragStore.alignedReadStore, Standard())), SortBeginPos());
224 TAlignIter alignItBegin = alignIt = lowerBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId());
225 alignItEnd = upperBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId());
226
227 // Get the maximum coverage and the slot for each read
228 typedef String<TSize> TFirstFreePos;
229 typedef typename Iterator<TFirstFreePos, Standard>::Type TPosIter;
230 TFirstFreePos freePos;
231 TSize pos = 0;
232 TSize maxTmp = 0;
233 TSize numCol = 0;
234 reserve(slot, alignItEnd - alignIt);
235 for(;alignIt != alignItEnd; ++alignIt) {
236 TPosIter itPos = begin(freePos, Standard());
237 TPosIter itPosEnd = end(freePos, Standard());
238 pos = 0;
239 for(;itPos != itPosEnd; ++itPos, ++pos)
240 if ((TContigPos)*itPos < _min(alignIt->beginPos, alignIt->endPos)) break;
241 if (pos + 1 > length(freePos)) resize(freePos, pos+1, Generous());
242 maxTmp = _max(alignIt->beginPos, alignIt->endPos);
243 freePos[pos] = maxTmp;
244 if (maxTmp > numCol) numCol = maxTmp;
245 appendValue(slot, pos);
246 }
247 coverage = length(freePos);
248 clear(freePos);
249
250 // Fill the matrix
251 typedef typename Iterator<TMatrix, Standard>::Type TMatIter;
252 resize(mat, coverage * numCol, '.');
253 alignIt = alignItBegin;
254 TSize readPos = 0;
255 TMatIter matIt = begin(mat, Standard());
256 typename TFragmentStore::TReadSeq myRead;
257 for(;alignIt != alignItEnd; ++alignIt, ++readPos) {
258 typedef typename Iterator<String<typename TFragmentStore::TReadGapAnchor>, Standard>::Type TReadGapsIter;
259 TReadGapsIter itGaps = begin(alignIt->gaps, Standard());
260 TReadGapsIter itGapsEnd = end(alignIt->gaps, Standard());
261
262 // Place each read inside the matrix
263 myRead = fragStore.readSeqStore[alignIt->readId];
264 TSize lenRead = length(myRead);
265 TSize offset = alignIt->beginPos;
266 if (alignIt->beginPos > alignIt->endPos) {
267 reverseComplement(myRead);
268 offset = alignIt->endPos;
269 }
270 matIt = begin(mat, Standard());
271 matIt += (slot[readPos] * numCol + offset);
272
273 typedef typename Iterator<typename TFragmentStore::TReadSeq, Standard>::Type TReadIter;
274 TReadIter seqReadIt = begin(myRead, Standard());
275
276 // First clear range
277 TSize mySeqPos = 0;
278 int diff = 0;
279 if ((itGaps != itGapsEnd) && (itGaps->gapPos == 0)) {
280 mySeqPos = itGaps->seqPos;
281 diff = -1 * mySeqPos;
282 seqReadIt += mySeqPos;
283 }
284 TSize clr2 = lenRead;
285 TSize stop = 0;
286 for(;itGaps != itGapsEnd; ++itGaps) {
287 // Any clipped sequence at the end
288 stop = itGaps->seqPos;
289 if (diff - ((int) itGaps->gapPos - (int) itGaps->seqPos) > 0)
290 clr2 = stop = lenRead - (diff - ((int) itGaps->gapPos - (int) itGaps->seqPos));
291
292 for(;mySeqPos < stop; ++matIt, ++seqReadIt, ++mySeqPos)
293 *matIt = *seqReadIt;
294
295 for(int i = 0; i < ((int) itGaps->gapPos - (int) itGaps->seqPos) - diff; ++i, ++matIt)
296 *matIt = gapChar;
297
298 diff = (itGaps->gapPos - itGaps->seqPos);
299 }
300 for(;mySeqPos < clr2; ++mySeqPos, ++seqReadIt, ++matIt)
301 *matIt = *seqReadIt;
302 }
303 //for(TSize row = 0; row < coverage; ++row) {
304 // for(TSize col = 0; col<numCol; ++col) {
305 // std::cout << mat[row * numCol + col];
306 // }
307 // std::cout << std::endl;
308 //}
309 return true;
310 }
311
312 //////////////////////////////////////////////////////////////////////////////
313
314 template<typename TSpec, typename TConfig, typename TMatrix, typename TSize2, typename TSize>
315 inline bool
convertAlignment(FragmentStore<TSpec,TConfig> & fragStore,TMatrix & mat,TSize2 contigId,TSize & coverage)316 convertAlignment(FragmentStore<TSpec, TConfig>& fragStore,
317 TMatrix& mat,
318 TSize2 contigId,
319 TSize& coverage)
320 {
321 String<TSize> slot;
322 return convertAlignment(fragStore, mat, contigId, coverage, slot);
323 }
324
325 //////////////////////////////////////////////////////////////////////////////
326
327 template<typename TSpec, typename TConfig, typename TMatrix>
328 inline bool
convertAlignment(FragmentStore<TSpec,TConfig> & fragStore,TMatrix & mat)329 convertAlignment(FragmentStore<TSpec, TConfig>& fragStore,
330 TMatrix& mat)
331 {
332 typedef FragmentStore<TSpec, TConfig> TFragmentStore;
333 typedef typename Size<TFragmentStore>::Type TSize;
334 TSize coverage;
335 return convertAlignment(fragStore, mat, 0, coverage);
336 }
337
338 //////////////////////////////////////////////////////////////////////////////
339
340 template<typename TSpec, typename TConfig, typename TGappedConsensus, typename TSize>
341 inline void
getGappedConsensus(FragmentStore<TSpec,TConfig> & fragStore,TGappedConsensus & gappedConsensus,TSize contigId)342 getGappedConsensus(FragmentStore<TSpec, TConfig>& fragStore,
343 TGappedConsensus& gappedConsensus,
344 TSize contigId)
345 {
346 typedef FragmentStore<TSpec, TConfig> TFragmentStore;
347 typedef typename Value<TGappedConsensus>::Type TValue;
348 typedef typename TFragmentStore::TContigPos TContigPos;
349
350 TValue gapChar = gapValue<TValue>();
351 typedef typename Iterator<typename TFragmentStore::TContigSeq, Standard>::Type TContigIter;
352 TContigIter seqContigIt = begin(fragStore.contigStore[contigId].seq, Standard());
353 TContigIter seqContigItEnd = end(fragStore.contigStore[contigId].seq, Standard());
354 typedef typename Iterator<String<typename TFragmentStore::TContigGapAnchor>, Standard>::Type TGapsIter;
355 TGapsIter itGaps = begin(fragStore.contigStore[contigId].gaps, Standard());
356 TGapsIter itGapsEnd = end(fragStore.contigStore[contigId].gaps, Standard());
357 int diff = 0;
358 TContigPos mySeqPos = 0;
359 for(;itGaps != itGapsEnd; goNext(itGaps)) {
360 for(;mySeqPos < itGaps->seqPos; ++seqContigIt, ++mySeqPos)
361 appendValue(gappedConsensus, *seqContigIt, Generous());
362
363 for(int i = 0; i < ((int) itGaps->gapPos - (int) itGaps->seqPos) - diff; ++i)
364 appendValue(gappedConsensus, gapChar, Generous());
365 //NOTE(h-2): the next line was previously indented although it belonged to the for loop
366 // in case of unexpected behaviour, enclose these statements in {}
367 diff = (itGaps->gapPos - itGaps->seqPos);
368 }
369 for(;seqContigIt != seqContigItEnd; ++seqContigIt)
370 appendValue(gappedConsensus, *seqContigIt, Generous());
371 }
372
373
374 //////////////////////////////////////////////////////////////////////////////
375
376 template<typename TSpec, typename TConfig, typename TGappedConsensus, typename TSize>
377 inline void
assignGappedConsensus(FragmentStore<TSpec,TConfig> & fragStore,TGappedConsensus & gappedCons,TSize contigId)378 assignGappedConsensus(FragmentStore<TSpec, TConfig>& fragStore,
379 TGappedConsensus& gappedCons,
380 TSize contigId)
381 {
382 typedef FragmentStore<TSpec, TConfig> TFragmentStore;
383 typedef typename Value<TGappedConsensus>::Type TValue;
384 TValue gapChar = gapValue<TValue>();
385
386 // Update the contig
387 typedef typename Value<typename TFragmentStore::TContigStore>::Type TContigStoreElement;
388 TContigStoreElement& contigEl = fragStore.contigStore[contigId];
389 clear(contigEl.gaps);
390 clear(contigEl.seq);
391
392 // Create the sequence and the gap anchors
393 typedef typename Iterator<TGappedConsensus, Standard>::Type TStringIter;
394 TStringIter seqIt = begin(gappedCons, Standard());
395 TStringIter seqItEnd = end(gappedCons, Standard());
396 typedef typename TFragmentStore::TReadPos TReadPos;
397 typedef typename TFragmentStore::TContigGapAnchor TContigGapAnchor;
398 TReadPos ungappedPos = 0;
399 TReadPos gappedPos = 0;
400 bool gapOpen = false;
401 for(;seqIt != seqItEnd; ++seqIt, ++gappedPos) {
402 if (*seqIt == gapChar) gapOpen = true;
403 else {
404 if (gapOpen) {
405 appendValue(contigEl.gaps, TContigGapAnchor(ungappedPos, gappedPos), Generous());
406 gapOpen = false;
407 }
408 Dna5Q letter = *seqIt;
409 assignQualityValue(letter, 'D');
410 appendValue(contigEl.seq, letter);
411 ++ungappedPos;
412 }
413 }
414 if (gapOpen)
415 appendValue(contigEl.gaps, TContigGapAnchor(ungappedPos, gappedPos), Generous());
416 }
417
418
419
420 //////////////////////////////////////////////////////////////////////////////////
421
422 /*
423 * @fn consensusAlignment
424 * @headerfile <seqan/consensus.h>
425 * @brief Compute consensus alignment.
426 *
427 * @signature void consensusAlignment(alignmentGraph, beginEndPos[, options])
428 *
429 * @param[out] alignmentGraph Alignment graph to build.
430 * @param[in] options Optional settings for the consenus alignment, type: <tt>ConsensusOptions</tt>.
431 * @param[in] beginEndPos Interval start and end position for the read's alignment; <tt>String<Pair<TPos, TPos> ></tt>.
432 *
433 * @section Example
434 *
435 * @code{.cpp}
436 * #include <seqan/sequence.h>
437 * #include <seqan/graph_align.h>
438 * #include <seqan/consensus.h>
439 *
440 * int main()
441 * {
442 * using namespace seqan;
443 *
444 * typedef StringSet<Dna5String> TStringSet;
445 * typedef Graph<Alignment<TStringSet, void, WithoutEdgeId> > TAlignGraph;
446 *
447 * TStringSet readSet;
448 * String<Pair<TSize> > begEndPos;
449 *
450 * appendValue(readSet, "CCCAGTGA");
451 * appendValue(begEndPos, Pair<TSize>(0, 5));
452 * appendValue(readSet, "AGGGACTGT");
453 * appendValue(begEndPos, Pair<TSize>(3, 9));
454 *
455 * TAlignGraph alignmentGraph(readSet);
456 * consensusAlignment(alignmentGraph, begEndPos);
457 *
458 * return 0;
459 * }
460 * @endcode
461 */
462
463 template<typename TStringSet, typename TCargo, typename TSpec, typename TSize, typename TConfigOptions>
464 inline void
consensusAlignment(Graph<Alignment<TStringSet,TCargo,TSpec>> & gOut,String<Pair<TSize,TSize>> & begEndPos,TConfigOptions const & consOpt)465 consensusAlignment(Graph<Alignment<TStringSet, TCargo, TSpec> >& gOut,
466 String<Pair<TSize, TSize> >& begEndPos,
467 TConfigOptions const& consOpt)
468 {
469 typedef Graph<Alignment<TStringSet, TCargo, TSpec> > TOutGraph;
470 typedef typename Id<TOutGraph>::Type TId;
471
472 // Initialization
473 TStringSet& seqSet = stringSet(gOut);
474
475 // Select all overlapping reads and record the diagonals of the band
476 String<Pair<TId, TId> > pList;
477 String<Pair<int, int> > diagList;
478 if (consOpt.window == 0)
479 selectPairsAssembly(seqSet, begEndPos, consOpt.bandwidth, pList, diagList);
480 else
481 selectPairsAllAgainstAll(seqSet, begEndPos, consOpt.window, pList, diagList);
482
483 // Set-up a sparse distance matrix
484 Graph<Undirected<double> > pairGraph;
485
486 // Containers for segment matches and corresponding scores
487 typedef String<Fragment<> > TFragmentString;
488 TFragmentString matches;
489 typedef String<int> TScoreValues;
490 TScoreValues scores;
491
492 // Compute segment matches from global pairwise alignments
493 appendSegmentMatches(seqSet, pList, diagList, begEndPos, consOpt.sc, consOpt.matchlength, consOpt.quality, consOpt.overlaps, matches, scores, pairGraph, OverlapLibrary() );
494 clear(pList);
495 clear(diagList);
496
497 // If there are no alignment matches, return
498 if (!length(matches)) return;
499
500 // Use these segment matches for the initial alignment graph
501 typedef Graph<Alignment<TStringSet, TSize> > TGraph;
502 TGraph g(seqSet);
503 buildAlignmentGraph(matches, scores, g, consOpt.sc, ReScore() );
504 clear(matches);
505 clear(scores);
506
507 // Guide Tree
508 Graph<Tree<double> > guideTree;
509 upgmaTree(pairGraph, guideTree);
510 clear(pairGraph);
511
512 // Triplet library extension
513 graphBasedTripletLibraryExtension(g);
514
515 // Perform a progressive alignment
516 progressiveAlignment(g, guideTree, gOut);
517 clear(g);
518 clear(guideTree);
519 }
520
521 //////////////////////////////////////////////////////////////////////////////////
522
523
524 template<typename TStringSet, typename TCargo, typename TSpec, typename TSize>
525 inline void
consensusAlignment(Graph<Alignment<TStringSet,TCargo,TSpec>> & gOut,String<Pair<TSize,TSize>> & begEndPos)526 consensusAlignment(Graph<Alignment<TStringSet, TCargo, TSpec> >& gOut,
527 String<Pair<TSize, TSize> >& begEndPos)
528 {
529 ConsensusOptions consOpt;
530 consensusAlignment(gOut, begEndPos, consOpt);
531 }
532
533
534 //////////////////////////////////////////////////////////////////////////////
535
536 template <typename TFragSpec, typename TConfig, typename TStringSet, typename TCargo, typename TSpec, typename TContigId>
537 inline void
updateContig(FragmentStore<TFragSpec,TConfig> & fragStore,Graph<Alignment<TStringSet,TCargo,TSpec>> const & g,TContigId contigId)538 updateContig(FragmentStore<TFragSpec, TConfig>& fragStore,
539 Graph<Alignment<TStringSet, TCargo, TSpec> > const& g,
540 TContigId contigId)
541 {
542 typedef Graph<Alignment<TStringSet, TCargo, TSpec> > TGraph;
543 typedef typename Size<TGraph>::Type TSize;
544 typedef typename Value<TStringSet>::Type TString;
545 typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
546 typedef std::map<TSize, TSize> TComponentLength;
547 typedef char TValue;
548
549 // Initialization
550 TStringSet& strSet = stringSet(g);
551 TSize nseq = length(strSet);
552 TValue gapChar = gapValue<TValue>();
553 TValue specialGap = '.';
554 TSize maxCoverage = 0;
555 TSize len = 0;
556 String<TValue> mat;
557
558 // Store for each read the begin position, the end position and the row in the alignment matrix
559 String<TSize> readBegEndRowPos;
560 resize(readBegEndRowPos, 3*nseq);
561
562 // Strongly Connected Components, topological sort, and length of each component
563 String<TSize> component;
564 String<TSize> order;
565 TComponentLength compLength;
566 if (convertAlignment(g, component, order, compLength)) {
567 TSize numOfComponents = length(order);
568
569 // Assign to each sequence the start and end (in terms of component ranks)
570 typedef String<std::pair<TSize, TSize> > TComponentToRank;
571 TComponentToRank compToRank;
572 for(TSize compIndex = 0; compIndex < numOfComponents; ++compIndex)
573 appendValue(compToRank, std::make_pair(order[compIndex], compIndex), Generous());
574 std::sort(begin(compToRank, Standard()), end(compToRank, Standard()));
575
576 typedef Pair<TSize, TSize> TRankPair;
577 typedef String<TRankPair> TSequenceToRanks;
578 TSequenceToRanks seqToRank;
579 resize(seqToRank, nseq);
580 typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
581 TVertexIterator itVertex(g);
582 for(;!atEnd(itVertex);++itVertex) {
583 TVertexDescriptor vert = value(itVertex);
584 TSize seq = idToPosition(strSet, sequenceId(g, vert));
585 if (fragmentBegin(g, vert) == 0)
586 seqToRank[seq].i1 = std::lower_bound(begin(compToRank, Standard()), end(compToRank, Standard()), std::make_pair((TSize) component[vert], (TSize) 0))->second;
587 if (fragmentBegin(g, vert) + fragmentLength(g, vert) == length(strSet[seq]))
588 seqToRank[seq].i2 = std::lower_bound(begin(compToRank, Standard()), end(compToRank, Standard()), std::make_pair((TSize) component[vert], (TSize) 0))->second;
589 }
590 clear(compToRank);
591
592 // Assign the sequences to rows
593 String<TSize> seqToRow;
594 resize(seqToRow, nseq);
595 maxCoverage = 0;
596 typedef String<bool> TLeftOver;
597 typedef typename Iterator<TLeftOver, Standard>::Type TLeftOverIter;
598 TLeftOver leftOver;
599 resize(leftOver, nseq, true);
600 typedef String<std::pair<TSize, TSize> > TSeqToBegin;
601 typedef typename Iterator<TSeqToBegin, Standard>::Type TSeqToBeginIter;
602 TSeqToBegin seqToBegin;
603 TSize finishedSeq = 0;
604 while(finishedSeq < nseq) {
605 TLeftOverIter itL = begin(leftOver, Standard());
606 TLeftOverIter itLEnd = end(leftOver, Standard());
607 for(TSize pos = 0; itL != itLEnd; ++itL, ++pos)
608 if (*itL) appendValue(seqToBegin, std::make_pair((seqToRank[pos]).i1, pos), Generous());
609 std::sort(begin(seqToBegin, Standard()), end(seqToBegin, Standard()));
610
611 TSize endPos = 0;
612 TSeqToBeginIter itSB = begin(seqToBegin, Standard());
613 TSeqToBeginIter itSBEnd = end(seqToBegin, Standard());
614 for(;itSB != itSBEnd;++itSB) {
615 if (endPos <= (*itSB).first) {
616 TSize currentSeq = (*itSB).second;
617 seqToRow[currentSeq] = maxCoverage;
618 endPos = (seqToRank[currentSeq]).i2 + 2;
619 leftOver[currentSeq] = false;
620 ++finishedSeq;
621 }
622 }
623 clear(seqToBegin);
624 ++maxCoverage;
625 }
626 clear(leftOver);
627
628 // Create the matrix
629 len = 0;
630 String<TSize> compOffset;
631 resize(compOffset, numOfComponents);
632 for(TSize compIndex = 0; compIndex < numOfComponents; ++compIndex) {
633 compOffset[order[compIndex]] = len;
634 len+=compLength[order[compIndex]];
635 }
636 resize(mat, len * maxCoverage, gapChar);
637
638 // Fill in the segments
639 typedef typename Infix<TString>::Type TInfix;
640 typedef typename Iterator<TInfix, Standard>::Type TInfixIter;
641 typedef typename TGraph::TPosToVertexMap_ TPosToVertexMap;
642 for(typename TPosToVertexMap::const_iterator it = g.data_pvMap.begin();it != g.data_pvMap.end(); ++it) {
643 TInfix str = label(g,it->second);
644 TSize c = property(component, it->second);
645 TSize row = seqToRow[idToPosition(strSet, it->first.first)];
646 //if (row == 0) {
647 // std::cout << sequenceId(g, it->second) << ':' << str << ',' << strSet[sequenceId(g, it->second)] << std::endl;
648 // std::cout << getProperty(component, it->second) << ',' << order[compIndex] << std::endl;
649 // std::cout << (seqToRank[sequenceId(g, it->second)]).i1 << ',' << (seqToRank[sequenceId(g, it->second)]).i2 << std::endl;
650 //}
651 TInfixIter sIt = begin(str, Standard());
652 TInfixIter sItEnd = end(str, Standard());
653 TSize i = compOffset[c];
654 for(TSize pCol = i;sIt!=sItEnd;++sIt, ++pCol, ++i)
655 mat[row * len + pCol] = *sIt;
656 }
657 String<bool> active;
658 for(TSize compIndex = 0; compIndex < numOfComponents; ++compIndex) {
659 TSize offset = compOffset[order[compIndex]];
660 TSize currentCompLength = compLength[order[compIndex]];
661
662 clear(active);
663 resize(active, maxCoverage, false);
664
665 // Find the empty rows
666 for(TSize i=0;i<nseq; ++i) {
667 if (((seqToRank[i]).i1 <= compIndex) && ((seqToRank[i]).i2 >= compIndex))
668 active[(seqToRow[i])] = true;
669 }
670
671 // Substitute false gaps with special gap character
672 for(TSize i = 0; i < maxCoverage; ++i) {
673 if (!(active[i])) {
674 for(TSize pCol = offset;pCol < offset + currentCompLength;++pCol)
675 mat[i * len + pCol] = specialGap;
676 }
677 }
678 }
679
680 // Get the new begin and end positions
681 for(TSize i=0;i<nseq; ++i) {
682 TVertexDescriptor lastVertex = findVertex(const_cast<TGraph&>(g), positionToId(strSet, i), length(strSet[i]) - 1);
683 TSize readBegin = compOffset[getProperty(component, findVertex(const_cast<TGraph&>(g), positionToId(strSet, i), 0))];
684 TSize readEnd = compOffset[getProperty(component, lastVertex)] + fragmentLength(const_cast<TGraph&>(g), lastVertex);
685 readBegEndRowPos[3*i] = readBegin;
686 readBegEndRowPos[3*i+1] = readEnd;
687 readBegEndRowPos[3*i+2] = seqToRow[i];
688 }
689
690 }
691 clear(component);
692 clear(order);
693 compLength.clear();
694
695
696 //// Debug code
697 //for(TSize row = 0; row<maxCoverage; ++row) {
698 // for(TSize col = 0; col<len; ++col) {
699 // std::cout << mat[row * len + col];
700 // }
701 // std::cout << std::endl;
702 //}
703
704 // Create the new consensus
705 typedef typename Value<TString>::Type TAlphabet;
706 String<TValue> gappedCons;
707 consensusCalling(mat, gappedCons, maxCoverage, TAlphabet(), MajorityVote());
708
709 // Assign new consensus
710 assignGappedConsensus(fragStore, gappedCons, contigId);
711
712 // Update all aligned reads
713 typedef FragmentStore<TSpec, TConfig> TFragmentStore;
714 typedef typename TFragmentStore::TReadPos TReadPos;
715 typedef typename TFragmentStore::TContigPos TContigPos;
716 typedef typename TFragmentStore::TContigGapAnchor TContigGapAnchor;
717 //typedef typename Value<typename TFragmentStore::TAlignedReadStore>::Type TAlignedElement;
718 typedef typename Iterator<typename TFragmentStore::TAlignedReadStore>::Type TAlignIter;
719 sortAlignedReads(fragStore.alignedReadStore, SortContigId());
720 TAlignIter alignIt = lowerBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId());
721 TAlignIter alignItEnd = upperBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId());
722 TReadPos ungappedPos = 0;
723 TReadPos gappedPos = 0;
724 bool gapOpen;
725 for(TSize i = 0;alignIt != alignItEnd; ++alignIt, ++i) {
726 TSize lenRead = length(fragStore.readSeqStore[alignIt->readId]);
727 TReadPos begClr = 0;
728 TReadPos endClr = 0;
729 getClrRange(fragStore, *alignIt, begClr, endClr);
730 clear(alignIt->gaps);
731 ungappedPos = begClr;
732 if (alignIt->beginPos > alignIt->endPos) ungappedPos = lenRead - endClr;
733 if (ungappedPos != 0) appendValue(alignIt->gaps, TContigGapAnchor(ungappedPos, 0));
734 gappedPos = 0;
735 gapOpen = false;
736 for(TSize column = readBegEndRowPos[3*i]; column<readBegEndRowPos[3*i + 1]; ++column, ++gappedPos) {
737 if (mat[readBegEndRowPos[3*i + 2] * len + column] == gapChar) gapOpen = true;
738 else {
739 if (gapOpen) {
740 appendValue(alignIt->gaps, TContigGapAnchor(ungappedPos, gappedPos), Generous());
741 gapOpen = false;
742 }
743 ++ungappedPos;
744 }
745 }
746 if (gapOpen) appendValue(alignIt->gaps, TContigGapAnchor(ungappedPos, gappedPos), Generous());
747 if (alignIt->beginPos < alignIt->endPos) {
748 if (endClr != (TContigPos)lenRead)
749 appendValue(alignIt->gaps, TContigGapAnchor(lenRead, lenRead + (gappedPos - ungappedPos) - (lenRead - endClr)), Generous());
750 } else {
751 if (begClr != 0)
752 appendValue(alignIt->gaps, TContigGapAnchor(lenRead, lenRead + (gappedPos - ungappedPos) - begClr), Generous());
753 }
754
755 // Set new begin and end position
756 if (alignIt->beginPos < alignIt->endPos) {
757 alignIt->beginPos = readBegEndRowPos[3*i];
758 alignIt->endPos = readBegEndRowPos[3*i+1];
759 } else {
760 alignIt->beginPos = readBegEndRowPos[3*i+1];
761 alignIt->endPos = readBegEndRowPos[3*i];
762 }
763 }
764 }
765
766
767 //////////////////////////////////////////////////////////////////////////////
768
769 template <typename TValue, typename TSpec, typename TCounters, typename TSize, typename TAlphabet>
770 inline void
_countLetters(String<TValue,TSpec> const & mat,TCounters & counterValues,TSize alignDepth,TAlphabet)771 _countLetters(String<TValue, TSpec> const & mat,
772 TCounters & counterValues,
773 TSize alignDepth,
774 TAlphabet)
775 {
776 typedef String<TValue, TSpec> const TMatrix;
777 typedef typename Iterator<TMatrix, Standard>::Type TMatIter;
778
779 // Initialization
780 TSize len = length(mat) / alignDepth;
781 TValue gapChar = gapValue<TValue>();
782 TValue specialGap = '.';
783 TSize alphabetSize = ValueSize<TAlphabet>::VALUE;
784
785 // Set-up counter values
786 typedef typename Value<TCounters>::Type TCounter;
787 typedef typename Iterator<TCounters, Standard>::Type TCounterIt;
788 resize(counterValues, len);
789 for(TSize i=0;i<len; ++i) {
790 TCounter counter;
791 resize(counter, alphabetSize + 1, 0);
792 counterValues[i] = counter;
793 }
794
795 // Count all
796 TMatIter matIt = begin(mat, Standard());
797 TMatIter matItEnd = end(mat, Standard());
798 TCounterIt countIt = begin(counterValues, Standard());
799 TSize pos = 0;
800 for(; matIt != matItEnd; ++matIt, ++countIt, ++pos) {
801 if (pos % len == 0) countIt = begin(counterValues, Standard());
802 if (*matIt != specialGap) {
803 if (*matIt == gapChar) ++value(*countIt, alphabetSize);
804 else ++value(*countIt, ordValue((TAlphabet) *matIt));
805 }
806 }
807 }
808
809
810 //////////////////////////////////////////////////////////////////////////////
811
812 template <typename TValue, typename TSpec, typename TGappedCons, typename TAlignDepth, typename TAlphabet>
813 inline void
consensusCalling(String<TValue,TSpec> const & mat,TGappedCons & gappedConsensus,TAlignDepth maxCoverage,TAlphabet,Bayesian)814 consensusCalling(String<TValue, TSpec> const& mat,
815 TGappedCons& gappedConsensus,
816 TAlignDepth maxCoverage,
817 TAlphabet,
818 Bayesian)
819 {
820 typedef double TProbability;
821 typedef String<TProbability> TProbabilityDistribution;
822 typedef String<TProbabilityDistribution> TPositionalPrDist;
823 typedef typename Iterator<TPositionalPrDist, Standard>::Type TPosPrDistIter;
824
825 typedef typename Size<String<TValue, TSpec> >::Type TSize;
826 TSize alphabetSize = ValueSize<TAlphabet>::VALUE;
827 TValue gapChar = gapValue<TValue>();
828 TValue specialGap = '.';
829
830 // Set-up the counters
831 typedef String<TSize> TCounter;
832 typedef String<TCounter> TCounters;
833 TCounters counterValues;
834 _countLetters(mat, counterValues, maxCoverage, TAlphabet() );
835
836
837 // Initialization
838 TSize len = length(mat) / maxCoverage;
839 TProbabilityDistribution backroundDist;
840 resize(backroundDist, alphabetSize + 1, ((TProbability) 1 / (TProbability) (alphabetSize + 1)));
841
842 // Get an initial consensus
843 typedef typename Iterator<TCounters, Standard>::Type TCounterIt;
844 TCounterIt countIt = begin(counterValues, Standard());
845 TCounterIt countItEnd = end(counterValues, Standard());
846 TPositionalPrDist posPrDist;
847 TValue c = TAlphabet();
848 for(;countIt != countItEnd; ++countIt) {
849 TSize max = 0;
850 typedef typename Iterator<TCounter, Standard>::Type TCIt;
851 TCIt cIt = begin(*countIt, Standard());
852 TCIt cItEnd = end(*countIt, Standard());
853 TSize pos = 0;
854 for(;cIt != cItEnd; ++cIt, ++pos) {
855 if (*cIt > max) {
856 max = *cIt;
857 c = (pos == alphabetSize) ? gapChar : (TValue) TAlphabet(pos);
858 }
859 }
860 TProbabilityDistribution prDist;
861 resize(prDist, alphabetSize + 1, 0);
862 if (c == gapChar) prDist[alphabetSize] = 1;
863 else prDist[ordValue((TAlphabet) c)] = 1;
864 appendValue(posPrDist, prDist, Generous());
865 }
866
867 TSize run = 1;
868 TProbabilityDistribution pI;
869 TProbabilityDistribution pIJ;
870 TProbabilityDistribution pIOld;
871 TProbabilityDistribution pIJOld;
872 while (run) {
873 // Store the values from the last iteration
874 pIOld = pI;
875 pIJOld = pIJ;
876
877 // Count all letters in the consensus
878 TProbabilityDistribution nI;
879 resize(nI, alphabetSize + 1, 0);
880 TPosPrDistIter itPosPrDist = begin(posPrDist, Standard());
881 TPosPrDistIter itPosPrDistEnd = end(posPrDist, Standard());
882 for(;itPosPrDist!=itPosPrDistEnd; ++itPosPrDist)
883 for(TSize i = 0; i<(alphabetSize + 1); ++i)
884 nI[i] += (*itPosPrDist)[i];
885
886 // Composition probabilities
887 clear(pI);
888 resize(pI, alphabetSize + 1);
889 TProbability lenPosPrDist = (TProbability) length(posPrDist);
890 for(TSize i = 0; i<length(pI); ++i)
891 pI[i] = nI[i] / lenPosPrDist;
892
893
894 // Count all letters that agree / disagree with the consensus
895 TProbabilityDistribution nIJ;
896 resize(nIJ, (alphabetSize + 1) * (alphabetSize + 1), 0);
897 typedef String<TValue, TSpec> TMatrix;
898 typedef typename Iterator<TMatrix, Standard>::Type TMatIter;
899 TMatIter matIt = begin(mat, Standard());
900 TMatIter matItEnd = end(mat, Standard());
901 itPosPrDist = begin(posPrDist, Standard());
902 TSize pos = 0;
903 for(; matIt != matItEnd; ++matIt, ++itPosPrDist, ++pos) {
904 if (pos % len == 0) itPosPrDist = begin(posPrDist, Standard());
905 TValue c = *matIt;
906 if (c != specialGap) {
907 TSize fragJ = (c != gapChar) ? ordValue(TAlphabet(c)) : alphabetSize;
908 for(TSize consI = 0; consI<(alphabetSize + 1); ++consI)
909 nIJ[consI * (alphabetSize + 1) + fragJ] += (*itPosPrDist)[consI];
910 }
911 }
912
913 // Sequencing error probabilities
914 clear(pIJ);
915 resize(pIJ, (alphabetSize + 1) * (alphabetSize + 1));
916 TProbability sumIJ = 0;
917 for(TSize diag = 0; diag<(alphabetSize + 1); ++diag) sumIJ += nIJ[diag * (alphabetSize + 1) + diag];
918 for(TSize consI = 0; consI<(alphabetSize + 1); ++consI)
919 for(TSize fragJ = 0; fragJ<(alphabetSize + 1); ++fragJ)
920 pIJ[consI * (alphabetSize + 1) + fragJ] = nIJ[consI * (alphabetSize + 1) + fragJ] / sumIJ;
921
922 // Recompute positional probability distribution
923 itPosPrDist = begin(posPrDist, Standard());
924 TSize col = 0;
925 for(;itPosPrDist!=itPosPrDistEnd; ++itPosPrDist, ++col) {
926 TProbabilityDistribution prDist;
927 resize(prDist, alphabetSize + 1);
928 for(TSize consI = 0; consI<(alphabetSize + 1); ++consI) {
929 TProbability numerator = pI[consI];
930 TProbability denominator = 0;
931 for(TSize allI = 0; allI<(alphabetSize + 1); ++allI) {
932 TProbability denominatorSub = value(pI, allI);
933 for(TSize row = 0; row < maxCoverage; ++row) {
934 TValue c = mat[row * len + col];
935 if (c != specialGap) {
936 TSize fragJ = (c != gapChar) ? ordValue(TAlphabet(c)) : alphabetSize;
937 if (allI == consI)
938 numerator *= pIJ[allI * (alphabetSize + 1) + fragJ];
939 denominatorSub *= pIJ[allI * (alphabetSize + 1) + fragJ];
940 }
941 }
942 denominator += denominatorSub;
943 }
944 prDist[consI] = numerator / denominator;
945 }
946 *itPosPrDist = prDist;
947 }
948
949 // Check termination criterion
950 TProbability eps = 0.00001;
951 typedef typename Iterator<TProbabilityDistribution, Standard>::Type TProbIter;
952 TProbIter pIter = begin(pIOld, Standard());
953 TProbIter pIterCompare = begin(pI, Standard());
954 TProbIter pIterEnd = end(pIOld, Standard());
955 TSize runOld = run;
956 for(;pIter != pIterEnd; ++pIter, ++pIterCompare) {
957 if (*pIter > *pIterCompare) {
958 if (*pIter - *pIterCompare > eps) {
959 ++run;
960 break;
961 }
962 } else {
963 if (*pIterCompare - *pIter > eps) {
964 ++run;
965 break;
966 }
967 }
968 }
969 if (runOld == run) {
970 pIter = begin(pIJOld, Standard());
971 pIterCompare = begin(pIJ, Standard());
972 pIterEnd = end(pIJOld, Standard());
973 for(;pIter != pIterEnd; ++pIter, ++pIterCompare) {
974 if (*pIter > *pIterCompare) {
975 if (*pIter - *pIterCompare > eps) {
976 ++run;
977 break;
978 }
979 } else {
980 if (*pIterCompare - *pIter > eps) {
981 ++run;
982 break;
983 }
984 }
985 }
986 }
987
988 if (runOld == run) {
989 std::cout << "Iterations: " << run << std::endl;
990 run = 0;
991 }
992 }
993
994 // Compute the most likely consensus
995 TPosPrDistIter itPosPrDist = begin(posPrDist, Standard());
996 TPosPrDistIter itPosPrDistEnd = end(posPrDist, Standard());
997 clear(gappedConsensus);
998 for(;itPosPrDist!=itPosPrDistEnd; ++itPosPrDist) {
999 TProbability max = 0;
1000 TSize ind = 0;
1001 for(TSize consI = 0; consI<(alphabetSize + 1); ++consI) {
1002 if ((*itPosPrDist)[consI] > max) {
1003 max = (*itPosPrDist)[consI];
1004 ind = consI;
1005 }
1006 }
1007 if (ind == alphabetSize) appendValue(gappedConsensus, gapChar);
1008 else appendValue(gappedConsensus, TAlphabet(ind));
1009 }
1010
1011 }
1012
1013 //////////////////////////////////////////////////////////////////////////////
1014
1015
1016 template <typename TFragSpec, typename TConfig, typename TContigId>
1017 inline void
consensusCalling(FragmentStore<TFragSpec,TConfig> & fragStore,TContigId contigId,Bayesian)1018 consensusCalling(FragmentStore<TFragSpec, TConfig>& fragStore,
1019 TContigId contigId,
1020 Bayesian)
1021 {
1022
1023 typedef FragmentStore<TFragSpec, TConfig> TFragmentStore;
1024 typedef typename Size<TFragmentStore>::Type TSize;
1025 typedef typename TFragmentStore::TReadSeq TReadSeq;
1026 typedef typename Value<TReadSeq>::Type TAlphabet;
1027 typedef char TValue;
1028
1029 // Convert the contig to an alignment matrix
1030 typedef String<TValue> TAlignMat;
1031 TAlignMat mat;
1032 TSize maxCoverage;
1033 convertAlignment(fragStore, mat, contigId, maxCoverage);
1034
1035 // Call the consensus
1036 String<TValue> gappedConsensus;
1037 consensusCalling(mat, gappedConsensus, maxCoverage, TAlphabet(), Bayesian());
1038
1039 // Assign the new consensus
1040 assignGappedConsensus(fragStore, gappedConsensus, contigId);
1041 }
1042
1043 //////////////////////////////////////////////////////////////////////////////
1044
1045 template <typename TValue, typename TSpec, typename TGappedCons, typename TAlignDepth, typename TAlphabet>
1046 inline void
consensusCalling(String<TValue,TSpec> const & mat,TGappedCons & gappedConsensus,TAlignDepth maxCoverage,TAlphabet,MajorityVote)1047 consensusCalling(String<TValue, TSpec> const& mat,
1048 TGappedCons& gappedConsensus,
1049 TAlignDepth maxCoverage,
1050 TAlphabet,
1051 MajorityVote)
1052 {
1053 typedef typename Size<String<TValue, TSpec> >::Type TSize;
1054 TSize alphabetSize = ValueSize<TAlphabet>::VALUE;
1055 TValue gapChar = gapValue<TValue>();
1056
1057 // Set-up the counters
1058 typedef String<TSize> TCounter;
1059 typedef String<TCounter> TCounters;
1060 TCounters counterValues;
1061 _countLetters(mat, counterValues, maxCoverage, TAlphabet() );
1062
1063 // Get the consensus
1064 typedef typename Iterator<TCounters, Standard>::Type TCounterIt;
1065 TCounterIt countIt = begin(counterValues, Standard());
1066 TCounterIt countItEnd = end(counterValues, Standard());
1067 clear(gappedConsensus);
1068 TSize max = 0;
1069 TValue c = TValue();
1070 TSize pos = 0;
1071 for(;countIt != countItEnd; ++countIt) {
1072 max = 0;
1073 typedef typename Iterator<TCounter, Standard>::Type TCIt;
1074 TCIt cIt = begin(*countIt, Standard());
1075 TCIt cItEnd = end(*countIt, Standard());
1076 pos = 0;
1077 for(;cIt != cItEnd; ++cIt, ++pos) {
1078 if (*cIt > max) {
1079 max = *cIt;
1080 c = (pos == alphabetSize) ? gapChar : (TValue) TAlphabet(pos);
1081 }
1082 }
1083 appendValue(gappedConsensus, c);
1084 }
1085 }
1086
1087
1088 //////////////////////////////////////////////////////////////////////////////
1089
1090
1091 template <typename TFragSpec, typename TConfig, typename TContigId>
1092 inline void
consensusCalling(FragmentStore<TFragSpec,TConfig> & fragStore,TContigId contigId,MajorityVote)1093 consensusCalling(FragmentStore<TFragSpec, TConfig>& fragStore,
1094 TContigId contigId,
1095 MajorityVote)
1096 {
1097 typedef FragmentStore<TFragSpec, TConfig> TFragmentStore;
1098 typedef typename Size<TFragmentStore>::Type TSize;
1099 typedef typename TFragmentStore::TReadSeq TReadSeq;
1100 typedef typename Value<TReadSeq>::Type TAlphabet;
1101 typedef char TValue;
1102
1103 // Convert the contig to an alignment matrix
1104 typedef String<TValue> TAlignMat;
1105 TAlignMat mat;
1106 TSize maxCoverage;
1107 convertAlignment(fragStore, mat, contigId, maxCoverage);
1108
1109 // Call the consensus
1110 String<TValue> gappedConsensus;
1111 consensusCalling(mat, gappedConsensus, maxCoverage, TAlphabet(), MajorityVote());
1112
1113 // Assign the new consensus
1114 assignGappedConsensus(fragStore, gappedConsensus, contigId);
1115 }
1116
1117
1118 //////////////////////////////////////////////////////////////////////////////
1119
1120
1121 //////////////////////////////////////////////////////////////////////////////
1122 // Old proprietary FastaReadFormat
1123 //////////////////////////////////////////////////////////////////////////////
1124
1125 //////////////////////////////////////////////////////////////////////////////
1126
1127 struct FastaReadFormat_;
1128 typedef Tag<FastaReadFormat_> const FastaReadFormat;
1129
1130 //////////////////////////////////////////////////////////////////////////////
1131
1132 template<typename TFile, typename TSpec, typename TConfig>
1133 inline void
write(TFile & file,FragmentStore<TSpec,TConfig> & fragStore,FastaReadFormat)1134 write(
1135 TFile & file,
1136 FragmentStore<TSpec, TConfig>& fragStore,
1137 FastaReadFormat)
1138 {
1139 //IOREV _nodoc_ what has this got to do with consensus? why is it here?
1140 // Basic types
1141 typedef FragmentStore<TSpec, TConfig> TFragmentStore;
1142 typedef typename Size<TFragmentStore>::Type TSize;
1143 typedef typename TFragmentStore::TContigPos TContigPos;
1144 //typedef typename Value<TFile>::Type TValue;
1145 typedef char TMultiReadChar;
1146 TMultiReadChar gapChar = gapValue<TMultiReadChar>();
1147
1148 typename DirectionIterator<TFile, Output>::Type target = directionIterator(file, Output());
1149
1150 typedef typename Iterator<typename TFragmentStore::TContigStore, Standard>::Type TContigIter;
1151 TContigIter contigIt = begin(fragStore.contigStore, Standard() );
1152 TContigIter contigItEnd = end(fragStore.contigStore, Standard() );
1153 for(TSize idCount = 0;contigIt != contigItEnd; ++contigIt, ++idCount) {
1154 // Alignment matrix
1155 typedef String<TMultiReadChar> TAlignMat;
1156 TAlignMat mat;
1157 TSize maxCoverage;
1158 String<TSize> readSlot;
1159 convertAlignment(fragStore, mat, idCount, maxCoverage, readSlot);
1160 TSize len = length(mat) / maxCoverage;
1161
1162 // Gapped consensus sequence
1163 typedef String<TMultiReadChar> TGappedConsensus;
1164 TGappedConsensus gappedConsensus;
1165 getGappedConsensus(fragStore, gappedConsensus, idCount);
1166
1167 // Print the alignment matrix
1168 String<TSize> coverage;
1169 resize(coverage, len, 0);
1170 typedef typename Iterator<TGappedConsensus, Standard>::Type TConsIter;
1171 TConsIter itCons = begin(gappedConsensus, Standard());
1172 TSize winSize = 60;
1173 int offset = 2;
1174 TSize column = 0;
1175 while (column<len) {
1176 TSize window_end = column + winSize;
1177 if (window_end >= len) window_end = len;
1178 // Position
1179 for(int i = 0; i<offset - 2; ++i) writeValue(target, ' ');
1180 write(target, "Pos: ");
1181 appendNumber(target, column);
1182 writeValue(target, '\n');
1183 // Ruler
1184 for(int i = 0; i<offset + 3; ++i) writeValue(target, ' ');
1185 for(TSize local_col = 1; local_col<window_end - column + 1; ++local_col) {
1186 if ((local_col % 10)==0)
1187 writeValue(target, ':');
1188 else if ((local_col % 5)==0)
1189 writeValue(target, '.');
1190 else
1191 writeValue(target, ' ');
1192 }
1193 writeValue(target, '\n');
1194 // Matrix
1195 for(TSize row = 0; row<maxCoverage; ++row) {
1196 TSize tmp = row;
1197 int off = 0;
1198 while (tmp / 10 != 0) {
1199 tmp /= 10;
1200 ++off;
1201 }
1202 for(int i = 0; i<offset - off; ++i) writeValue(target, ' ');
1203 appendNumber(target, row);
1204 writeValue(target, ':');
1205 writeValue(target, ' ');
1206 for(TSize local_col = column; local_col<window_end; ++local_col) {
1207 writeValue(target, mat[row * len + local_col]);
1208 if (mat[row * len + local_col] != '.') ++coverage[local_col];
1209 }
1210 writeValue(target, '\n');
1211 }
1212 writeValue(target, '\n');
1213
1214 // Consensus
1215 for(int i = 0; i<offset; ++i)
1216 writeValue(target, ' ');
1217 write(target, "C: ");
1218 for(unsigned int local_col = column; local_col<window_end; ++local_col, ++itCons)
1219 writeValue(target, *itCons);
1220 writeValue(target, '\n');
1221 for(int i = 0; i<offset-1; ++i)
1222 writeValue(target, ' ');
1223 write(target, ">2: ");
1224 for(unsigned int local_col = column; local_col<window_end; ++local_col) {
1225 if (coverage[local_col] > 2)
1226 writeValue(target, gappedConsensus[local_col]);
1227 else
1228 writeValue(target, gapChar);
1229 }
1230 writeValue(target, '\n');
1231 writeValue(target, '\n');
1232 column+=winSize;
1233 }
1234 writeValue(target, '\n');
1235 writeValue(target, '\n');
1236
1237 // Print all aligned reads belonging to this contig
1238
1239 // Sort according to contigId
1240 sortAlignedReads(fragStore.alignedReadStore, SortContigId());
1241
1242 // Find range of the given contig
1243 typedef typename Iterator<typename TFragmentStore::TAlignedReadStore, Standard>::Type TAlignIter;
1244 TAlignIter alignIt = lowerBoundAlignedReads(fragStore.alignedReadStore, idCount, SortContigId());
1245 TAlignIter alignItEnd = upperBoundAlignedReads(fragStore.alignedReadStore, idCount, SortContigId());
1246
1247 // Sort the reads according to the begin position
1248 sortAlignedReads(infix(fragStore.alignedReadStore, alignIt - begin(fragStore.alignedReadStore, Standard()), alignItEnd - begin(fragStore.alignedReadStore, Standard())), SortBeginPos());
1249 TAlignIter alignItTmp = lowerBoundAlignedReads(fragStore.alignedReadStore, idCount, SortContigId());
1250 TAlignIter alignItTmpEnd = upperBoundAlignedReads(fragStore.alignedReadStore, idCount, SortContigId());
1251 String<std::pair<TSize, TSize> > idToPos;
1252 reserve(idToPos, alignItTmpEnd - alignItTmp);
1253 for(TSize iCount = 0; alignItTmp!=alignItTmpEnd; ++iCount, ++alignItTmp)
1254 appendValue(idToPos, std::make_pair(alignItTmp->id, readSlot[iCount]));
1255 std::sort(begin(idToPos, Standard()), end(idToPos, Standard()));
1256
1257 // Sort the reads according to the id
1258 sortAlignedReads(infix(fragStore.alignedReadStore, alignIt - begin(fragStore.alignedReadStore, Standard()), alignItEnd - begin(fragStore.alignedReadStore, Standard())), SortId());
1259 alignIt = lowerBoundAlignedReads(fragStore.alignedReadStore, idCount, SortContigId());
1260 alignItEnd = upperBoundAlignedReads(fragStore.alignedReadStore, idCount, SortContigId());
1261
1262 bool noNamesPresent = (length(fragStore.readNameStore) == 0);
1263 for(TSize iCount = 0;alignIt != alignItEnd; ++alignIt, ++iCount) {
1264
1265 // Print all reads
1266 write(target, "typ:");
1267 if (!noNamesPresent)
1268 {
1269 writeValue(target, 'R');
1270 appendNumber(target, iCount);
1271 }
1272 else
1273 {
1274 write(target, fragStore.readNameStore[alignIt->readId]);
1275 }
1276 write(target, "\nseq:");
1277 write(target, fragStore.readSeqStore[alignIt->readId]);
1278 write(target, "\nPos:");
1279 appendNumber(target, alignIt->beginPos);
1280 writeValue(target, ',');
1281 appendNumber(target, alignIt->endPos);
1282 writeValue(target, '\n');
1283 #ifndef CELERA_OFFSET
1284 TSize begClr = 0;
1285 TSize endClr = 0;
1286 getClrRange(fragStore, *alignIt, begClr, endClr);
1287 write(target, "clr:");
1288 appendNumber(target, begClr);
1289 writeValue(target, ',');
1290 appendNumber(target, endClr);
1291 writeValue(target, '\n');
1292 #endif
1293 std::stringstream gapCoords;
1294 TSize letterCount = 0;
1295 TSize gapCount = 0;
1296 for(TContigPos column = _min(alignIt->beginPos, alignIt->endPos); column < _max(alignIt->beginPos, alignIt->endPos); ++column) {
1297 if (mat[idToPos[iCount].second * len + column] == gapChar) {
1298 ++gapCount;
1299 gapCoords << letterCount << ' ';
1300 } else ++letterCount;
1301 }
1302 write(target, "dln:");
1303 appendNumber(target, gapCount);
1304 writeValue(target, '\n');
1305 write(target, "del:");
1306 write(target, gapCoords.str());
1307 writeValue(target, '\n');
1308 writeValue(target, '\n');
1309 }
1310 }
1311 }
1312
1313 //////////////////////////////////////////////////////////////////////////////
1314 // Read simulator format: Simple fasta read file with positions
1315 //////////////////////////////////////////////////////////////////////////////
1316
1317
1318 //////////////////////////////////////////////////////////////////////////////////
1319
1320
1321 template<typename TFile, typename TSpec, typename TConfig, typename TFilePath>
1322 inline void
_convertSimpleReadFile(TFile &,FragmentStore<TSpec,TConfig> &,TFilePath &,bool)1323 _convertSimpleReadFile(TFile& /*file*/,
1324 FragmentStore<TSpec, TConfig>& /*fragStore*/,
1325 TFilePath& /*filePath*/,
1326 bool /*moveToFront*/)
1327 {
1328 return;
1329 ////IOREV _nodoc_ huge undocumented function, uses custom IO based on iostream and FILE* :S
1330 // // Basic types
1331 // typedef FragmentStore<TSpec, TConfig> TFragmentStore;
1332 // typedef typename Id<TFragmentStore>::Type TId;
1333 // typedef typename Size<TFragmentStore>::Type TSize;
1334 // //typedef typename Value<TFile>::Type TValue;
1335 // typedef typename TFragmentStore::TContigPos TPos;
1336 // typedef typename TFragmentStore::TReadSeq TReadSeq;
1337 //
1338 // // All fragment store element types
1339 // typedef typename Value<typename TFragmentStore::TContigStore>::Type TContigElement;
1340 // typedef typename Value<typename TFragmentStore::TLibraryStore>::Type TLibraryStoreElement;
1341 // typedef typename Value<typename TFragmentStore::TMatePairStore>::Type TMatePairElement;
1342 // typedef typename Value<typename TFragmentStore::TReadStore>::Type TReadStoreElement;
1343 // typedef typename Value<typename TFragmentStore::TAlignedReadStore>::Type TAlignedElement;
1344 //
1345 //
1346 // // All maps to mirror file ids to our internal ids
1347 // typedef std::map<TId, TId> TIdMap;
1348 // TIdMap libIdMap;
1349 // TIdMap frgIdMap;
1350 // TIdMap readIdMap;
1351 //
1352 // // Create RecordReader object.
1353 // typename DirectionIterator<TFile, Input>::Type reader = directionIterator(file, Input());
1354 //
1355 // // Parse the file and convert the internal ids
1356 // TPos maxPos = 0;
1357 // TPos minPos = std::numeric_limits<TPos>::max();
1358 // TId count = 0;
1359 // if (atEnd(reader))
1360 // return false;
1361 // CharString buffer;
1362 // while (!atEnd(reader))
1363 // {
1364 // // New read?
1365 // if (value(reader) == '>')
1366 // {
1367 // TAlignedElement alignEl;
1368 // TId id = count;
1369 // TId fragId = TReadStoreElement::INVALID_ID;
1370 // TId repeatId = 0;
1371 //
1372 // goNext(reader);
1373 // if (skipWhitespaces(reader) != 0)
1374 // return 1;
1375 //
1376 // // Get the layout positions
1377 // clear(buffer);
1378 // if (readDigits(buffer, reader) != 0)
1379 // return 1;
1380 // if (!lexicalCast2(alignEl.beginPos, buffer))
1381 // return 1;
1382 // goNext(reader);
1383 // if (skipWhitespaces(reader) != 0)
1384 // return 1;
1385 // clear(buffer);
1386 // if (readDigits(buffer, reader) != 0)
1387 // return 1;
1388 // if (!lexicalCast2(alignEl.endPos, buffer))
1389 // return 1;
1390 //
1391 // // Any attributes?
1392 // String<char> eid;
1393 // String<char> qlt;
1394 // TReadSeq seq;
1395 // if (value(reader) == '[')
1396 // {
1397 // String<char> fdIdentifier;
1398 // while (value(reader) != ']')
1399 // {
1400 // goNext(reader);
1401 // if (skipWhitespaces(reader) != 0)
1402 // return 1;
1403 // clear(fdIdentifier);
1404 // if (readAlphaNums(fdIdentifier, reader) != 0)
1405 // return 1;
1406 // goNext(reader); // Skip "="
1407 // if (fdIdentifier == "id")
1408 // {
1409 // clear(buffer);
1410 // if (readDigits(buffer, reader) != 0)
1411 // return 1;
1412 // if (!lexicalCast2(id, buffer))
1413 // return 1;
1414 // } else if (fdIdentifier == "fragId") {
1415 // clear(buffer);
1416 // if (readDigits(buffer, reader) != 0)
1417 // return 1;
1418 // if (!lexicalCast2(fragId, buffer))
1419 // return 1;
1420 // } else if (fdIdentifier == "repeatId") {
1421 // clear(buffer);
1422 // if (readDigits(buffer, reader) != 0)
1423 // return 1;
1424 // if (!lexicalCast2(repeatId, buffer))
1425 // return 1;
1426 // } else if (fdIdentifier == "eid") {
1427 // if (readUntilOneOf(eid, reader, ',', ']') != 0)
1428 // return 1;
1429 // } else if (fdIdentifier == "qlt") {
1430 // if (readUntilOneOf(qlt, reader, ',', ']') != 0)
1431 // return 1;
1432 // } else {
1433 // // Jump to next attribute
1434 // // TODO(holtgrew): Add skipUntilOneOf()?
1435 // if (readUntilOneOf(buffer, reader, ',', ']') != 0)
1436 // return 1;
1437 // }
1438 // }
1439 // }
1440 // if (skipLine(reader) != 0)
1441 // return 1;
1442 // if (skipWhitespaces(reader) != 0)
1443 // return 1;
1444 // while (!atEnd(reader) && value(reader) != '>')
1445 // {
1446 // if (readLetters(seq, reader) != 0)
1447 // return 1;
1448 // int res = skipWhitespaces(reader);
1449 // if (res != 0 && res != EOF_BEFORE_SUCCESS)
1450 // return 1;
1451 // }
1452 //
1453 // // Set quality
1454 // typedef typename Iterator<TReadSeq, Standard>::Type TReadIter;
1455 // typedef typename Iterator<String<char> >::Type TQualIter;
1456 // TReadIter begIt = begin(seq, Standard() );
1457 // TReadIter begItEnd = begin(seq, Standard() );
1458 // if (length(qlt)) {
1459 // TQualIter qualIt = begin(qlt);
1460 // TQualIter qualItEnd = end(qlt);
1461 // for(;qualIt != qualItEnd; goNext(qualIt), goNext(begIt)) assignQualityValue(value(begIt), value(qualIt));
1462 // } else {
1463 // for(;begIt != begItEnd; goNext(begIt)) assignQualityValue(value(begIt), 'D');
1464 // }
1465 //
1466 // // Set eid if not given
1467 // if (empty(eid)) {
1468 // std::stringstream input;
1469 // input << "R" << id;
1470 // input << "-" << repeatId;
1471 // eid = input.str().c_str();
1472 // }
1473 //
1474 // // Insert the read
1475 // readIdMap.insert(std::make_pair(id, static_cast<TId>(length(fragStore.readStore))));
1476 // appendRead(fragStore, seq, fragId);
1477 // appendValue(fragStore.readNameStore, eid, Generous());
1478 //
1479 // // Insert an aligned read
1480 // TSize readLen = length(seq);
1481 // if (alignEl.beginPos < alignEl.endPos) {
1482 // if ((TPos)readLen != alignEl.endPos - alignEl.beginPos) {
1483 // alignEl.endPos = alignEl.beginPos + readLen;
1484 // }
1485 // if (alignEl.beginPos < minPos) minPos = alignEl.beginPos;
1486 // if (alignEl.endPos > maxPos) maxPos = alignEl.endPos;
1487 // } else {
1488 // if ((TPos)readLen != alignEl.beginPos - alignEl.endPos) {
1489 // alignEl.beginPos = alignEl.endPos + readLen;
1490 // }
1491 // if (alignEl.endPos < minPos) minPos = alignEl.endPos;
1492 // if (alignEl.beginPos > maxPos) maxPos = alignEl.beginPos;
1493 // }
1494 // alignEl.readId = id;
1495 // alignEl.pairMatchId = fragId;
1496 // alignEl.contigId = 0;
1497 // alignEl.id = length(fragStore.alignedReadStore);
1498 // appendValue(fragStore.alignedReadStore, alignEl, Generous());
1499 // ++count;
1500 // } else {
1501 // if (skipLine(reader) != 0)
1502 // return 1;
1503 // }
1504 // }
1505 //
1506 // // Read contig or reference sequence
1507 // TContigElement contigEl;
1508 // std::string fileName = filePath + 'S';
1509 // SeqFileIn strmRef;
1510 // String<char> contigEid = "C0";
1511 // if (open(strmRef, fileName.c_str()))
1512 // {
1513 // typename DirectionIterator<TFile, Input>::Type readerRef = directionIterator(strmRef, Input());
1514 // clear(contigEid);
1515 // readRecord(contigEid, contigEl.seq, strmRef);
1516 // close(strmRef);
1517 // }
1518 // if (empty(contigEl.seq)) {
1519 // typedef typename TFragmentStore::TContigGapAnchor TContigGapAnchor;
1520 // if (moveToFront) appendValue(contigEl.gaps, TContigGapAnchor(0, maxPos - minPos), Generous());
1521 // else appendValue(contigEl.gaps, TContigGapAnchor(0, maxPos), Generous());
1522 // }
1523 // appendValue(fragStore.contigStore, contigEl, Generous());
1524 // appendValue(fragStore.contigNameStore, contigEid, Generous());
1525 //
1526 //
1527 // // Read fragments
1528 // fileName = filePath + 'F';
1529 // FILE* strmFrag = fopen(fileName.c_str(), "rb");
1530 // if (!strmFrag)
1531 // return 1;
1532 // RecordReader<FILE *, SinglePass<> > readerFrag(strmFrag);
1533 // while (!atEnd(readerFrag))
1534 // {
1535 // if (value(readerFrag) == '>')
1536 // {
1537 // TMatePairElement matePairEl;
1538 // goNext(readerFrag);
1539 // if (skipWhitespaces(readerFrag) != 0)
1540 // return 1;
1541 //
1542 // // Get the fragment id
1543 // clear(buffer);
1544 // if (readDigits(buffer, readerFrag) != 0)
1545 // return 1;
1546 // TId id = 0;
1547 // if (!lexicalCast2(id, buffer))
1548 // return 1;
1549 //
1550 // // Any attributes?
1551 // std::stringstream input;
1552 // input << "F" << id;
1553 // String<char> eid(input.str().c_str());
1554 // if (value(readerFrag) == '[')
1555 // {
1556 // String<char> fdIdentifier;
1557 // while (value(readerFrag) != ']')
1558 // {
1559 // goNext(readerFrag);
1560 // if (skipWhitespaces(readerFrag) != 0)
1561 // return 1;
1562 // clear(fdIdentifier);
1563 // if (readAlphaNums(fdIdentifier, readerFrag) != 0)
1564 // return 1;
1565 // goNext(readerFrag);
1566 // if (fdIdentifier == "libId")
1567 // {
1568 // clear(buffer);
1569 // if (readDigits(buffer, readerFrag) != 0)
1570 // return 1;
1571 // if (!lexicalCast2(matePairEl.libId, buffer))
1572 // return 1;
1573 // } else if (fdIdentifier == "eid") {
1574 // clear(eid);
1575 // if (readUntilOneOf(eid, readerFrag, ',', ']') != 0)
1576 // return 1;
1577 // } else {
1578 // // Jump to next attribute
1579 // if (readUntilOneOf(buffer, readerFrag, ',', ']') != 0)
1580 // return 1;
1581 // }
1582 // }
1583 // }
1584 // if (skipLine(readerFrag) != 0)
1585 // return 1;
1586 //
1587 // // Read the two reads belonging to this mate pair
1588 // clear(buffer);
1589 // for (int i = 0; i < 2; ++i)
1590 // {
1591 // if (skipWhitespaces(readerFrag) != 0)
1592 // return 1;
1593 // clear(buffer);
1594 // if (readDigits(buffer, readerFrag) != 0)
1595 // return 1;
1596 // if (!lexicalCast2(matePairEl.readId[i], buffer))
1597 // return 1;
1598 // if (!i) // Skip ','.
1599 // goNext(readerFrag);
1600 // }
1601 // int res = skipLine(readerFrag);
1602 // if (res != 0 && res != EOF_BEFORE_SUCCESS)
1603 // return 1;
1604 //
1605 // // Insert mate pair
1606 // if (matePairEl.readId[0] != matePairEl.readId[1]) {
1607 // frgIdMap.insert(std::make_pair(id, static_cast<TId>(length(fragStore.matePairStore))));
1608 // appendValue(fragStore.matePairStore, matePairEl, Generous());
1609 // appendValue(fragStore.matePairNameStore, eid, Generous());
1610 // }
1611 // } else {
1612 // int res = skipLine(readerFrag);
1613 // if (res != 0 && res != EOF_BEFORE_SUCCESS)
1614 // return 1;
1615 // }
1616 // }
1617 // fclose(strmFrag);
1618 //
1619 // // Read libraries
1620 // fileName = filePath + 'L';
1621 // FILE* strmLib = fopen(fileName.c_str(), "rb");
1622 // if (!strmLib)
1623 // return 1;
1624 // RecordReader<FILE *, SinglePass<> > readerLib(strmLib);
1625 // while (!atEnd(readerLib))
1626 // {
1627 // if (value(readerLib) == '>')
1628 // {
1629 // TLibraryStoreElement libEl;
1630 // goNext(readerLib);
1631 // if (skipWhitespaces(readerLib) != 0)
1632 // return 1;
1633 //
1634 // // Get the fragment id
1635 // clear(buffer);
1636 // if (readDigits(buffer, readerLib) != 0)
1637 // return 1;
1638 // TId id = 0;
1639 // if (!lexicalCast2(id, buffer))
1640 // return 1;
1641 //
1642 // // Any attributes?
1643 // std::stringstream input;
1644 // input << "L" << id;
1645 // String<char> eid(input.str().c_str());
1646 // if (value(readerLib) == '[')
1647 // {
1648 // String<char> fdIdentifier;
1649 // while (value(readerLib) != ']')
1650 // {
1651 // goNext(readerLib);
1652 // if (skipWhitespaces(readerLib) != 0)
1653 // return 1;
1654 // clear(fdIdentifier);
1655 // if (readAlphaNums(fdIdentifier, readerLib) != 0)
1656 // return 1;
1657 // if (fdIdentifier == "eid")
1658 // {
1659 // clear(eid);
1660 // if (readUntilOneOf(eid, readerLib, ',', ']') != 0)
1661 // return 1;
1662 // } else {
1663 // // Jump to next attribute
1664 // // TODO(holtgrew): skipUntilOneOf()?
1665 // if (readUntilOneOf(buffer, readerLib, ',', ']') != 0)
1666 // return 1;
1667 // }
1668 // }
1669 // }
1670 // if (skipLine(readerLib) != 0)
1671 // return 1;
1672 // if (skipWhitespaces(readerLib) != 0)
1673 // return 1;
1674 //
1675 // // Read the mean and standard deviation
1676 // clear(buffer);
1677 // if (readDigits(buffer, readerLib) != 0)
1678 // return 1;
1679 // if (!lexicalCast2(libEl.mean, buffer))
1680 // return 1;
1681 // if (skipWhitespaces(readerLib) != 0)
1682 // return 1;
1683 // goNext(readerLib);
1684 // clear(buffer);
1685 // if (readDigits(buffer, readerLib) != 0)
1686 // return 1;
1687 // if (!lexicalCast2(libEl.std, buffer))
1688 // return 1;
1689 // int res = skipLine(readerLib);
1690 // if (res != 0 && res != EOF_BEFORE_SUCCESS)
1691 // return 1;
1692 //
1693 // // Insert mate pair
1694 // libIdMap.insert(std::make_pair(id, static_cast<TId>(length(fragStore.libraryStore))));
1695 // appendValue(fragStore.libraryStore, libEl, Generous());
1696 // appendValue(fragStore.libraryNameStore, eid, Generous());
1697 // } else {
1698 // int res = skipLine(readerLib);
1699 // if (res != 0 && res != EOF_BEFORE_SUCCESS)
1700 // return 1;
1701 // }
1702 // }
1703 // fclose(strmLib);
1704 //
1705 // // Renumber all ids
1706 // typedef typename TIdMap::const_iterator TIdMapIter;
1707 // typedef typename Iterator<typename TFragmentStore::TMatePairStore>::Type TMateIter;
1708 // TMateIter mateIt = begin(fragStore.matePairStore);
1709 // TMateIter mateItEnd = end(fragStore.matePairStore);
1710 // for(;mateIt != mateItEnd; goNext(mateIt)) {
1711 // if (mateIt->libId != TMatePairElement::INVALID_ID) {
1712 // TIdMapIter libIdPos = libIdMap.find(mateIt->libId);
1713 // if (libIdPos != libIdMap.end()) mateIt->libId = libIdPos->second;
1714 // else mateIt->libId = TMatePairElement::INVALID_ID;
1715 // }
1716 // if (mateIt->readId[0] != TMatePairElement::INVALID_ID) {
1717 // TIdMapIter readIdPos = readIdMap.find(mateIt->readId[0]);
1718 // if (readIdPos != readIdMap.end()) mateIt->readId[0] = readIdPos->second;
1719 // else mateIt->readId[0] = TMatePairElement::INVALID_ID;
1720 // }
1721 // if (mateIt->readId[1]!= TMatePairElement::INVALID_ID) {
1722 // TIdMapIter readIdPos = readIdMap.find(mateIt->readId[1]);
1723 // if (readIdPos != readIdMap.end()) mateIt->readId[1] = readIdPos->second;
1724 // else mateIt->readId[0] = TMatePairElement::INVALID_ID;
1725 // }
1726 // }
1727 // typedef typename Iterator<typename TFragmentStore::TReadStore>::Type TReadIter;
1728 // TReadIter readIt = begin(fragStore.readStore);
1729 // TReadIter readItEnd = end(fragStore.readStore);
1730 // for(;readIt != readItEnd; goNext(readIt)) {
1731 // if (readIt->matePairId != TReadStoreElement::INVALID_ID) {
1732 // TIdMapIter mateIdPos = frgIdMap.find(readIt->matePairId);
1733 // if (mateIdPos != frgIdMap.end()) readIt->matePairId = mateIdPos->second;
1734 // else readIt->matePairId = TReadStoreElement::INVALID_ID;
1735 // }
1736 // }
1737 // typedef typename Iterator<typename TFragmentStore::TAlignedReadStore>::Type TAlignIter;
1738 // TAlignIter alignIt = begin(fragStore.alignedReadStore);
1739 // TAlignIter alignItEnd = end(fragStore.alignedReadStore);
1740 // for(;alignIt != alignItEnd; goNext(alignIt)) {
1741 // if (alignIt->readId != TAlignedElement::INVALID_ID) {
1742 // TIdMapIter readIdPos = readIdMap.find(alignIt->readId);
1743 // if (readIdPos != readIdMap.end()) alignIt->readId = readIdPos->second;
1744 // else alignIt->readId = TAlignedElement::INVALID_ID;
1745 // }
1746 // if (moveToFront) {
1747 // alignIt->beginPos -= minPos;
1748 // alignIt->endPos -= minPos;
1749 // }
1750 // }
1751 // return 0;
1752 }
1753
1754
1755 //////////////////////////////////////////////////////////////////////////////
1756 // Rudimentary write functions for CeleraFrg and Celera Cgb
1757 //////////////////////////////////////////////////////////////////////////////
1758
1759 template<typename TFile, typename TSpec, typename TConfig>
1760 inline void
_writeCeleraFrg(TFile & file,FragmentStore<TSpec,TConfig> & fragStore)1761 _writeCeleraFrg(TFile& file,
1762 FragmentStore<TSpec, TConfig>& fragStore)
1763 {
1764 //IOREV _nodoc_
1765 typedef FragmentStore<TSpec, TConfig> TFragmentStore;
1766 typedef typename Size<TFragmentStore>::Type TSize;
1767 typedef typename TFragmentStore::TReadPos TReadPos;
1768
1769 typename DirectionIterator<TFile, Output>::Type target = directionIterator(file, Output());
1770
1771 // Iterate over all aligned reads to get the clear ranges
1772 typedef Pair<TReadPos, TReadPos> TClrRange;
1773 String<TClrRange> clearStr;
1774 resize(clearStr, length(fragStore.readStore));
1775 typedef typename Iterator<typename TFragmentStore::TAlignedReadStore>::Type TAlignIter;
1776 TAlignIter alignIt = begin(fragStore.alignedReadStore);
1777 TAlignIter alignItEnd = end(fragStore.alignedReadStore);
1778 for(;alignIt != alignItEnd; goNext(alignIt))
1779 {
1780 TReadPos begClr = 0;
1781 TReadPos endClr = 0;
1782 getClrRange(fragStore, value(alignIt), begClr, endClr);
1783 value(clearStr, alignIt->readId) = TClrRange(begClr, endClr);
1784 }
1785
1786 // Write Reads
1787 typedef typename Iterator<typename TFragmentStore::TReadStore>::Type TReadIter;
1788 TReadIter readIt = begin(fragStore.readStore);
1789 TReadIter readItEnd = end(fragStore.readStore);
1790 bool noNamesPresent = (length(fragStore.readNameStore) == 0);
1791 for (TSize idCount = 0; readIt != readItEnd; goNext(readIt), ++idCount)
1792 {
1793 write(target, "{FRG\nact:A\nacc:");
1794 appendNumber(target, idCount + 1);
1795 write(target, "\ntyp:R\n");
1796 if (!noNamesPresent)
1797 {
1798 write(target, "src:\n");
1799 write(target, fragStore.readNameStore[idCount]);
1800 write(target, "\n.\n");
1801 }
1802 write(target, "etm:0\nseq:\n");
1803 writeWrappedString(target, fragStore.readSeqStore[idCount], 70);
1804 write(target, ".\nqlt:\n");
1805 typedef typename Value<typename TFragmentStore::TReadSeqStore>::Type TReadSeq;
1806 typedef QualityExtractor<typename Value<TReadSeq>::Type> TQualityExtractor;
1807 ModifiedString<TReadSeq const, ModView<TQualityExtractor> > quals(fragStore.readSeqStore[idCount]);
1808 writeWrappedString(target, quals, 70);
1809 // Note: Clear range does not have to be ordered, e.g. no indication for reverse complemented reads, this is happening in cgb records
1810 write(target, ".\nclr:");
1811 appendNumber(target, clearStr[idCount].i1);
1812 writeValue(target, ',');
1813 appendNumber(target, clearStr[idCount].i2);
1814 write(target, "\n}\n");
1815 }
1816 }
1817
1818
1819 //////////////////////////////////////////////////////////////////////////////
1820
1821 // Writes out the first contig only.
1822
1823 // TODO(holtgrew): This is only used in seqcons, should it go into app?
1824 template<typename TFile, typename TSpec, typename TConfig>
1825 inline void
_writeCeleraCgb(TFile & file,FragmentStore<TSpec,TConfig> & fragStore)1826 _writeCeleraCgb(TFile& file,
1827 FragmentStore<TSpec, TConfig>& fragStore)
1828 {
1829 typedef FragmentStore<TSpec, TConfig> TFragmentStore;
1830 typedef typename Size<TFragmentStore>::Type TSize;
1831 typedef typename Id<TFragmentStore>::Type TId;
1832 //typedef typename TFragmentStore::TReadPos TReadPos;
1833
1834 typename DirectionIterator<TFile, Output>::Type target = directionIterator(file, Output());
1835
1836 // Write the first contig
1837 TId contigId = 0;
1838
1839 // Sort the reads according to position
1840 sortAlignedReads(fragStore.alignedReadStore, SortBeginPos());
1841
1842 // Write Header
1843 write(target, "{IUM\nacc:0\nsrc:\ngen> @@ [0,0]\n.\ncov:0.000\nsta:X\nfur:X\nabp:0\nbbp:0\nlen:");
1844 appendNumber(target, length((value(fragStore.contigStore, contigId)).seq));
1845 write(target, "\ncns:\n.\nqlt:\n.\nfor:0\nnfr:");
1846 appendNumber(target, length(fragStore.readStore));
1847 writeValue(target, '\n');
1848
1849 // Write reads
1850 typedef typename Iterator<typename TFragmentStore::TAlignedReadStore>::Type TAlignIter;
1851 TAlignIter alignIt = begin(fragStore.alignedReadStore);
1852 TAlignIter alignItEnd = end(fragStore.alignedReadStore);
1853 TSize offsetLeft = _min(alignIt->beginPos, alignIt->endPos);
1854 for (;alignIt != alignItEnd; goNext(alignIt))
1855 {
1856 if (contigId != alignIt->contigId)
1857 continue;
1858 write(target, "{IMP\ntyp:R\nmid:");
1859 appendNumber(target, alignIt->readId + 1);
1860 write(target, "\ncon:0\npos:");
1861 appendNumber(target, alignIt->beginPos - offsetLeft);
1862 writeValue(target, ',');
1863 appendNumber(target, alignIt->endPos - offsetLeft);
1864 write(target, "\ndln:0\ndel:\n}\n");
1865 }
1866 write(target, "}\n");
1867 }
1868
1869 } // namespace seqan
1870
1871 #endif //#ifndef SEQAN_HEADER_...
1872