1 /*=========================================================================
2 Copyright (C) 2009 by Stephan Aiche
3
4 This program is free software; you can redistribute it and/or
5 modify it under the terms of the GNU Lesser General Public
6 License as published by the Free Software Foundation; either
7 version 3 of the License, or (at your options) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Lesser General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>.
16 ==========================================================================
17 $Id: assembly_parser.h 8418 2010-12-21 15:15:23Z holtgrew $
18 ==========================================================================*/
19
20 #ifndef REPSEP_HEADER_ASSEMBLY_PARSER_H
21 #define REPSEP_HEADER_ASSEMBLY_PARSER_H
22
23 #include <seqan/store.h>
24 #include <seqan/modifier.h>
25
26 #include <map>
27
28 //#define REPSEP_DEBUG_ASSEMBLY_COL_PARSER
29
30 using namespace seqan;
31
32 template<typename TGapsAnchorArray, typename TUngappedSequence, typename TReadPos, typename TGappedSequence>
33 inline void
_gap_sequence(TGapsAnchorArray const & gaps,TUngappedSequence const & ungapped_sequence,TReadPos beginClr,TReadPos endClr,TGappedSequence & seq)34 _gap_sequence(TGapsAnchorArray const & gaps,
35 TUngappedSequence const & ungapped_sequence,
36 TReadPos beginClr,
37 TReadPos endClr,
38 TGappedSequence & seq)
39 {
40 clear(seq);
41 typedef typename Size<TGapsAnchorArray>::Type TSize;
42 typedef typename Value<TGappedSequence>::Type TGappedValue;
43
44 TSize ungapped_position = beginClr;
45 TSize gapped_position = 0;
46
47 for(TSize g = 0; g < length(gaps) ; ++g) {
48 while (static_cast<TReadPos>(ungapped_position) < gaps[g].seqPos && static_cast<TReadPos>(ungapped_position) < endClr) {
49 // put
50 append(seq, TGappedValue(ungapped_sequence[ungapped_position] , ungapped_position));
51 ++ungapped_position;
52 ++gapped_position;
53 }
54 while (static_cast<TReadPos>(gapped_position) < gaps[g].gapPos) {
55 append(seq, TGappedValue('-', endClr + 1));
56 ++gapped_position;
57 }
58 }
59
60 // fill up the rest
61 while (static_cast<TReadPos>(ungapped_position) < endClr) {
62 append(seq, TGappedValue(ungapped_sequence[ungapped_position] , ungapped_position));
63 ++ungapped_position;
64 }
65 }
66
67
68 // build alignment matrix based on Amos-Assembly -- the main worker
69 template<typename TSpec, typename TConfig, typename TId, typename TAnnotatedCandidateColumn, typename TScannerType>
70 inline void
parseContig(FragmentStore<TSpec,TConfig> const & fragStore,TId const contigId,String<TAnnotatedCandidateColumn> & candidates,TScannerType const algo_spec)71 parseContig(FragmentStore<TSpec, TConfig> const& fragStore,
72 TId const contigId,
73 String<TAnnotatedCandidateColumn> & candidates,
74 TScannerType const algo_spec)
75
76 {
77 clear(candidates);
78
79 typedef FragmentStore<TSpec, TConfig> TFragmentStore;
80 typedef typename Size<TFragmentStore>::Type TSize;
81 typedef typename TFragmentStore::TContigPos TContigPos;
82 typedef typename TFragmentStore::TReadPos TReadPos;
83
84 typedef typename Value<TAnnotatedCandidateColumn, 2>::Type TCandidateColumn;
85
86 // must be triplet
87 typedef typename Value<TCandidateColumn>::Type TAssignedReadChar;
88
89 // All fragment store element types
90 typedef typename Value<typename TFragmentStore::TReadStore>::Type TReadStoreElement;
91 typedef typename Value<typename TFragmentStore::TAlignedReadStore>::Type TAlignedElement;
92
93 //
94 typedef Pair<char, TReadPos> TGappedSequenceChar;
95 typedef String<TGappedSequenceChar> TGappedSequence;
96
97 // Sort by begin position ..
98 sortAlignedReads(fragStore.alignedReadStore, SortBeginPos());
99 // Sort aligned reads according to contig id
100 sortAlignedReads(fragStore.alignedReadStore, SortContigId());
101
102 TGappedSequence gapped_consensus;
103
104 _gap_sequence(fragStore.contigStore[contigId].gaps, fragStore.contigStore[contigId].seq, (TReadPos) 0 , (TReadPos) length(fragStore.contigStore[contigId].seq), gapped_consensus);
105 TSize l_gapped_consensus = length(gapped_consensus);
106
107 typedef typename Iterator<typename TFragmentStore::TAlignedReadStore>::Type TAlignIter;
108 TAlignIter alignIt = lowerBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId());
109 TAlignIter alignItEnd = upperBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId());
110
111 typedef std::multiset< TAlignedElement , _LessAlignedRead< TAlignedElement, SortEndPos > > TAlignedReadSet;
112
113 typedef typename TAlignedReadSet::iterator TAlignedReadSetIter;
114
115 TAlignedReadSet current_read_set;
116 #ifdef REPSEP_DEBUG_ASSEMBLY_COL_PARSER // debug output of all reads in this contig
117 TAlignIter debug_alignIt = lowerBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId());
118 TAlignIter debug_alignItEnd = upperBoundAlignedReads(fragStore.alignedReadStore, contigId, SortContigId());
119
120 while(debug_alignIt != debug_alignItEnd)
121 {
122 cout << debug_alignIt->readId << " (" << fragStore.readNameStore[debug_alignIt->readId] << ")" << endl;
123 goNext(debug_alignIt);
124 }
125
126 #endif
127 for(TSize p = 0 ; p < l_gapped_consensus ; ++p)
128 {
129 #ifdef REPSEP_DEBUG_ASSEMBLY_COL_PARSER
130 cout << "Start inspecting column " << p << endl;
131 #endif
132 // check if we need to remove some of the reads
133 TAlignedReadSetIter end_erase = current_read_set.begin();
134 TAlignedReadSetIter set_end = current_read_set.end();
135 if(end_erase != set_end && _max(end_erase->beginPos, end_erase->endPos) == static_cast<TContigPos>(p)) {
136 // we need to remove some of our AlignedReads
137
138 TAlignedReadSetIter start_erase = end_erase;
139
140 while (end_erase != set_end && _max(end_erase->beginPos, end_erase->endPos) == static_cast<TContigPos>(p)) {
141 #ifdef REPSEP_DEBUG_ASSEMBLY_COL_PARSER
142 cout << "-> will remove: " << end_erase->readId << " (" << fragStore.readNameStore[end_erase->readId] << ")" << endl;
143 #endif
144 ++end_erase;
145 }
146 // do we need to go to previous ???
147
148 current_read_set.erase(start_erase, end_erase);
149 }
150
151 // add all open reads into set
152 while(alignIt != alignItEnd && _min(alignIt->beginPos, alignIt->endPos) == static_cast<TContigPos>(p)) {
153 // add
154 current_read_set.insert(*alignIt);
155 #ifdef REPSEP_DEBUG_ASSEMBLY_COL_PARSER
156 cout << "added" << endl;
157 cout << alignIt->readId << " (" << fragStore.readNameStore[alignIt->readId] << ")" << endl;
158 #endif
159 goNext(alignIt);
160 }
161 #ifdef REPSEP_DEBUG_ASSEMBLY_COL_PARSER
162 cout << "after add cycle iterator points to " << endl;
163 cout << alignIt->readId << " (" << fragStore.readNameStore[alignIt->readId] << ")" << endl;
164 #endif
165 // since also a gap can be off interest we also inspect those
166 TCandidateColumn column;
167
168 // inspect consensus
169 TAlignedReadSetIter iter_cr = current_read_set.begin();
170 TAlignedReadSetIter iter_cr_end = current_read_set.end();
171
172 #ifdef REPSEP_DEBUG_ASSEMBLY_COL_PARSER
173 cout << "at position " << p << " following reads are active (" << current_read_set.size() << "):" << endl;
174 #endif
175 while(iter_cr != iter_cr_end) {
176 #ifdef REPSEP_DEBUG_ASSEMBLY_COL_PARSER
177 cout << iter_cr->readId << " (" << fragStore.readNameStore[iter_cr->readId] << ")" << endl;
178 #endif
179 // get position in sequence
180 TReadPos offset = _min(iter_cr->beginPos, iter_cr->endPos);
181 TGappedSequence gapped_read;
182
183 TReadPos begClr = 0;
184 TReadPos endClr = 0;
185 getClrRange(fragStore, *iter_cr, begClr, endClr);
186 String<Dna5Q> ungapped_seq = fragStore.readSeqStore[iter_cr->readId];
187 if(iter_cr->beginPos > iter_cr->endPos) {
188 reverseComplement(ungapped_seq);
189 // swap the clr's
190 TReadPos tmp = begClr;
191 begClr = length(ungapped_seq) - endClr;
192 endClr = length(ungapped_seq) - tmp;
193 }
194 _gap_sequence(iter_cr->gaps, ungapped_seq, begClr, endClr, gapped_read);
195
196 // increase vote
197 append(column, TAssignedReadChar(_sequenceCharacter(gapped_read[p - offset]), iter_cr->readId, _positionInRead(gapped_read[p - offset])));
198
199 ++iter_cr;
200 }
201
202 // inspect column and mark as candidate ..
203 if( isCandidate(fragStore, contigId, gapped_consensus[p], column, algo_spec ) )
204 {
205 appendValue(candidates, TAnnotatedCandidateColumn(p, column));
206 }
207 }
208 }
209
210
211
212
213 #endif
214