1 /*=========================================================================
2   Copyright (C) 2009 by Stephan Aiche
3 
4   This program is free software; you can redistribute it and/or
5   modify it under the terms of the GNU Lesser General Public
6   License as published by the Free Software Foundation; either
7   version 3 of the License, or (at your options) any later version.
8 
9   This program is distributed in the hope that it will be useful,
10   but WITHOUT ANY WARRANTY; without even the implied warranty of
11   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12   Lesser General Public License for more details.
13 
14   You should have received a copy of the GNU General Public License
15   along with this program.  If not, see <http://www.gnu.org/licenses/>.
16  ==========================================================================
17   $Id: assembly_parser.h 8418 2010-12-21 15:15:23Z holtgrew $
18  ==========================================================================*/
19 
20 #ifndef REPSEP_HEADER_ASSEMBLY_PARSER_H
21 #define REPSEP_HEADER_ASSEMBLY_PARSER_H
22 
23 #include <seqan/store.h>
24 #include <seqan/modifier.h>
25 
26 #include <map>
27 
28 //#define REPSEP_DEBUG_ASSEMBLY_COL_PARSER
29 
30 using namespace seqan;
31 
32 template<typename TGapsAnchorArray, typename TUngappedSequence, typename TReadPos, typename TGappedSequence>
33 inline void
_gap_sequence(TGapsAnchorArray const & gaps,TUngappedSequence const & ungapped_sequence,TReadPos beginClr,TReadPos endClr,TGappedSequence & seq)34 _gap_sequence(TGapsAnchorArray const & gaps,
35              TUngappedSequence const & ungapped_sequence,
36              TReadPos beginClr,
37              TReadPos endClr,
38              TGappedSequence & seq)
39 {
40     clear(seq);
41     typedef typename Size<TGapsAnchorArray>::Type TSize;
42     typedef typename Value<TGappedSequence>::Type TGappedValue;
43 
44     TSize ungapped_position = beginClr;
45     TSize gapped_position = 0;
46 
47     for(TSize g = 0; g < length(gaps) ; ++g) {
48         while (static_cast<TReadPos>(ungapped_position) < gaps[g].seqPos && static_cast<TReadPos>(ungapped_position) < endClr) {
49             // put
50             append(seq, TGappedValue(ungapped_sequence[ungapped_position] , ungapped_position));
51             ++ungapped_position;
52             ++gapped_position;
53         }
54         while (static_cast<TReadPos>(gapped_position) < gaps[g].gapPos) {
55             append(seq, TGappedValue('-', endClr + 1));
56             ++gapped_position;
57         }
58     }
59 
60     // fill up the rest
61     while (static_cast<TReadPos>(ungapped_position) < endClr) {
62         append(seq, TGappedValue(ungapped_sequence[ungapped_position] , ungapped_position));
63         ++ungapped_position;
64     }
65 }
66 
67 
68 // build alignment matrix based on Amos-Assembly -- the main worker
69 template<typename TSpec, typename TConfig, typename TId, typename TAnnotatedCandidateColumn, typename TScannerType>
70 inline void
parseContig(FragmentStore<TSpec,TConfig> const & fragStore,TId const contigId,String<TAnnotatedCandidateColumn> & candidates,TScannerType const algo_spec)71 parseContig(FragmentStore<TSpec, TConfig> const& fragStore,
72            TId const contigId,
73            String<TAnnotatedCandidateColumn> & candidates,
74            TScannerType const algo_spec)
75 
76 {
77     clear(candidates);
78 
79     typedef FragmentStore<TSpec, TConfig> TFragmentStore;
80     typedef typename Size<TFragmentStore>::Type TSize;
81     typedef typename TFragmentStore::TContigPos TContigPos;
82     typedef typename TFragmentStore::TReadPos TReadPos;
83 
84     typedef typename Value<TAnnotatedCandidateColumn, 2>::Type TCandidateColumn;
85 
86     // must be triplet
87     typedef typename Value<TCandidateColumn>::Type TAssignedReadChar;
88 
89 	// All fragment store element types
90     typedef typename Value<typename TFragmentStore::TReadStore>::Type TReadStoreElement;
91     typedef typename Value<typename TFragmentStore::TAlignedReadStore>::Type TAlignedElement;
92 
93     //
94     typedef Pair<char, TReadPos> TGappedSequenceChar;
95     typedef String<TGappedSequenceChar> TGappedSequence;
96 
97     // Sort by begin position ..
98     sortAlignedReads(fragStore.alignedReadStore, SortBeginPos());
99 	// Sort aligned reads according to contig id
100     sortAlignedReads(fragStore.alignedReadStore, SortContigId());
101 
102     TGappedSequence gapped_consensus;
103 
104     _gap_sequence(fragStore.contigStore[contigId].gaps, fragStore.contigStore[contigId].seq, (TReadPos) 0 , (TReadPos) length(fragStore.contigStore[contigId].seq), gapped_consensus);
105     TSize l_gapped_consensus = length(gapped_consensus);
106 
107     typedef typename Iterator<typename TFragmentStore::TAlignedReadStore>::Type TAlignIter;
108     TAlignIter alignIt = lowerBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId());
109     TAlignIter alignItEnd = upperBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId());
110 
111     typedef std::multiset< TAlignedElement , _LessAlignedRead< TAlignedElement, SortEndPos > > TAlignedReadSet;
112 
113     typedef typename TAlignedReadSet::iterator TAlignedReadSetIter;
114 
115     TAlignedReadSet current_read_set;
116 #ifdef REPSEP_DEBUG_ASSEMBLY_COL_PARSER // debug output of all reads in this contig
117     TAlignIter debug_alignIt = lowerBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId());
118     TAlignIter debug_alignItEnd = upperBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId());
119 
120     while(debug_alignIt != debug_alignItEnd)
121     {
122       cout << debug_alignIt->readId << " (" << fragStore.readNameStore[debug_alignIt->readId] << ")" << endl;
123       goNext(debug_alignIt);
124     }
125 
126 #endif
127     for(TSize p = 0 ; p < l_gapped_consensus ; ++p)
128     {
129 #ifdef REPSEP_DEBUG_ASSEMBLY_COL_PARSER
130         cout << "Start inspecting column " << p << endl;
131 #endif
132         // check if we need to remove some of the reads
133         TAlignedReadSetIter end_erase = current_read_set.begin();
134         TAlignedReadSetIter set_end = current_read_set.end();
135         if(end_erase != set_end && _max(end_erase->beginPos, end_erase->endPos) == static_cast<TContigPos>(p)) {
136             // we need to remove some of our AlignedReads
137 
138             TAlignedReadSetIter start_erase = end_erase;
139 
140             while (end_erase != set_end && _max(end_erase->beginPos, end_erase->endPos) == static_cast<TContigPos>(p)) {
141 #ifdef REPSEP_DEBUG_ASSEMBLY_COL_PARSER
142                 cout << "-> will remove: " << end_erase->readId << " (" << fragStore.readNameStore[end_erase->readId] << ")" << endl;
143 #endif
144                 ++end_erase;
145             }
146             // do we need to go to previous ???
147 
148             current_read_set.erase(start_erase, end_erase);
149         }
150 
151         // add all open reads into set
152         while(alignIt != alignItEnd && _min(alignIt->beginPos, alignIt->endPos) == static_cast<TContigPos>(p)) {
153             // add
154             current_read_set.insert(*alignIt);
155 #ifdef REPSEP_DEBUG_ASSEMBLY_COL_PARSER
156             cout << "added" << endl;
157             cout << alignIt->readId << " (" << fragStore.readNameStore[alignIt->readId] << ")" << endl;
158 #endif
159             goNext(alignIt);
160         }
161 #ifdef REPSEP_DEBUG_ASSEMBLY_COL_PARSER
162         cout << "after add cycle iterator points to " << endl;
163         cout << alignIt->readId << " (" << fragStore.readNameStore[alignIt->readId] << ")" << endl;
164 #endif
165         // since also a gap can be off interest we also inspect those
166         TCandidateColumn column;
167 
168         // inspect consensus
169         TAlignedReadSetIter iter_cr = current_read_set.begin();
170         TAlignedReadSetIter iter_cr_end = current_read_set.end();
171 
172 #ifdef REPSEP_DEBUG_ASSEMBLY_COL_PARSER
173         cout << "at position " << p << " following reads are active (" << current_read_set.size() << "):" << endl;
174 #endif
175         while(iter_cr != iter_cr_end) {
176 #ifdef REPSEP_DEBUG_ASSEMBLY_COL_PARSER
177             cout << iter_cr->readId << " (" << fragStore.readNameStore[iter_cr->readId] << ")" << endl;
178 #endif
179             // get position in sequence
180             TReadPos offset = _min(iter_cr->beginPos, iter_cr->endPos);
181             TGappedSequence gapped_read;
182 
183 	        TReadPos begClr = 0;
184 	        TReadPos endClr = 0;
185 	        getClrRange(fragStore, *iter_cr, begClr, endClr);
186             String<Dna5Q> ungapped_seq = fragStore.readSeqStore[iter_cr->readId];
187             if(iter_cr->beginPos > iter_cr->endPos) {
188                 reverseComplement(ungapped_seq);
189                 // swap the clr's
190 	            TReadPos tmp = begClr;
191 	            begClr = length(ungapped_seq) - endClr;
192 	            endClr = length(ungapped_seq) - tmp;
193             }
194             _gap_sequence(iter_cr->gaps, ungapped_seq, begClr, endClr, gapped_read);
195 
196             // increase vote
197             append(column, TAssignedReadChar(_sequenceCharacter(gapped_read[p - offset]), iter_cr->readId, _positionInRead(gapped_read[p - offset])));
198 
199             ++iter_cr;
200         }
201 
202         // inspect column and mark as candidate ..
203         if( isCandidate(fragStore, contigId, gapped_consensus[p], column, algo_spec ) )
204         {
205             appendValue(candidates, TAnnotatedCandidateColumn(p, column));
206         }
207     }
208 }
209 
210 
211 
212 
213 #endif
214