/*========================================================================= Copyright (C) 2009 by Stephan Aiche This program is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your options) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . ========================================================================== $Id: assembly_parser.h 8418 2010-12-21 15:15:23Z holtgrew $ ==========================================================================*/ #ifndef REPSEP_HEADER_ASSEMBLY_PARSER_H #define REPSEP_HEADER_ASSEMBLY_PARSER_H #include #include #include //#define REPSEP_DEBUG_ASSEMBLY_COL_PARSER using namespace seqan; template inline void _gap_sequence(TGapsAnchorArray const & gaps, TUngappedSequence const & ungapped_sequence, TReadPos beginClr, TReadPos endClr, TGappedSequence & seq) { clear(seq); typedef typename Size::Type TSize; typedef typename Value::Type TGappedValue; TSize ungapped_position = beginClr; TSize gapped_position = 0; for(TSize g = 0; g < length(gaps) ; ++g) { while (static_cast(ungapped_position) < gaps[g].seqPos && static_cast(ungapped_position) < endClr) { // put append(seq, TGappedValue(ungapped_sequence[ungapped_position] , ungapped_position)); ++ungapped_position; ++gapped_position; } while (static_cast(gapped_position) < gaps[g].gapPos) { append(seq, TGappedValue('-', endClr + 1)); ++gapped_position; } } // fill up the rest while (static_cast(ungapped_position) < endClr) { append(seq, TGappedValue(ungapped_sequence[ungapped_position] , ungapped_position)); ++ungapped_position; } } // build alignment matrix based on Amos-Assembly -- the main worker template inline void parseContig(FragmentStore const& fragStore, TId const contigId, String & candidates, TScannerType const algo_spec) { clear(candidates); typedef FragmentStore TFragmentStore; typedef typename Size::Type TSize; typedef typename TFragmentStore::TContigPos TContigPos; typedef typename TFragmentStore::TReadPos TReadPos; typedef typename Value::Type TCandidateColumn; // must be triplet typedef typename Value::Type TAssignedReadChar; // All fragment store element types typedef typename Value::Type TReadStoreElement; typedef typename Value::Type TAlignedElement; // typedef Pair TGappedSequenceChar; typedef String TGappedSequence; // Sort by begin position .. sortAlignedReads(fragStore.alignedReadStore, SortBeginPos()); // Sort aligned reads according to contig id sortAlignedReads(fragStore.alignedReadStore, SortContigId()); TGappedSequence gapped_consensus; _gap_sequence(fragStore.contigStore[contigId].gaps, fragStore.contigStore[contigId].seq, (TReadPos) 0 , (TReadPos) length(fragStore.contigStore[contigId].seq), gapped_consensus); TSize l_gapped_consensus = length(gapped_consensus); typedef typename Iterator::Type TAlignIter; TAlignIter alignIt = lowerBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId()); TAlignIter alignItEnd = upperBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId()); typedef std::multiset< TAlignedElement , _LessAlignedRead< TAlignedElement, SortEndPos > > TAlignedReadSet; typedef typename TAlignedReadSet::iterator TAlignedReadSetIter; TAlignedReadSet current_read_set; #ifdef REPSEP_DEBUG_ASSEMBLY_COL_PARSER // debug output of all reads in this contig TAlignIter debug_alignIt = lowerBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId()); TAlignIter debug_alignItEnd = upperBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId()); while(debug_alignIt != debug_alignItEnd) { cout << debug_alignIt->readId << " (" << fragStore.readNameStore[debug_alignIt->readId] << ")" << endl; goNext(debug_alignIt); } #endif for(TSize p = 0 ; p < l_gapped_consensus ; ++p) { #ifdef REPSEP_DEBUG_ASSEMBLY_COL_PARSER cout << "Start inspecting column " << p << endl; #endif // check if we need to remove some of the reads TAlignedReadSetIter end_erase = current_read_set.begin(); TAlignedReadSetIter set_end = current_read_set.end(); if(end_erase != set_end && _max(end_erase->beginPos, end_erase->endPos) == static_cast(p)) { // we need to remove some of our AlignedReads TAlignedReadSetIter start_erase = end_erase; while (end_erase != set_end && _max(end_erase->beginPos, end_erase->endPos) == static_cast(p)) { #ifdef REPSEP_DEBUG_ASSEMBLY_COL_PARSER cout << "-> will remove: " << end_erase->readId << " (" << fragStore.readNameStore[end_erase->readId] << ")" << endl; #endif ++end_erase; } // do we need to go to previous ??? current_read_set.erase(start_erase, end_erase); } // add all open reads into set while(alignIt != alignItEnd && _min(alignIt->beginPos, alignIt->endPos) == static_cast(p)) { // add current_read_set.insert(*alignIt); #ifdef REPSEP_DEBUG_ASSEMBLY_COL_PARSER cout << "added" << endl; cout << alignIt->readId << " (" << fragStore.readNameStore[alignIt->readId] << ")" << endl; #endif goNext(alignIt); } #ifdef REPSEP_DEBUG_ASSEMBLY_COL_PARSER cout << "after add cycle iterator points to " << endl; cout << alignIt->readId << " (" << fragStore.readNameStore[alignIt->readId] << ")" << endl; #endif // since also a gap can be off interest we also inspect those TCandidateColumn column; // inspect consensus TAlignedReadSetIter iter_cr = current_read_set.begin(); TAlignedReadSetIter iter_cr_end = current_read_set.end(); #ifdef REPSEP_DEBUG_ASSEMBLY_COL_PARSER cout << "at position " << p << " following reads are active (" << current_read_set.size() << "):" << endl; #endif while(iter_cr != iter_cr_end) { #ifdef REPSEP_DEBUG_ASSEMBLY_COL_PARSER cout << iter_cr->readId << " (" << fragStore.readNameStore[iter_cr->readId] << ")" << endl; #endif // get position in sequence TReadPos offset = _min(iter_cr->beginPos, iter_cr->endPos); TGappedSequence gapped_read; TReadPos begClr = 0; TReadPos endClr = 0; getClrRange(fragStore, *iter_cr, begClr, endClr); String ungapped_seq = fragStore.readSeqStore[iter_cr->readId]; if(iter_cr->beginPos > iter_cr->endPos) { reverseComplement(ungapped_seq); // swap the clr's TReadPos tmp = begClr; begClr = length(ungapped_seq) - endClr; endClr = length(ungapped_seq) - tmp; } _gap_sequence(iter_cr->gaps, ungapped_seq, begClr, endClr, gapped_read); // increase vote append(column, TAssignedReadChar(_sequenceCharacter(gapped_read[p - offset]), iter_cr->readId, _positionInRead(gapped_read[p - offset]))); ++iter_cr; } // inspect column and mark as candidate .. if( isCandidate(fragStore, contigId, gapped_consensus[p], column, algo_spec ) ) { appendValue(candidates, TAnnotatedCandidateColumn(p, column)); } } } #endif