1 // ==========================================================================
2 // SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 // * Redistributions of source code must retain the above copyright
11 // notice, this list of conditions and the following disclaimer.
12 // * Redistributions in binary form must reproduce the above copyright
13 // notice, this list of conditions and the following disclaimer in the
14 // documentation and/or other materials provided with the distribution.
15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 // its contributors may be used to endorse or promote products derived
17 // from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: Tobias Rausch <rausch@embl.de>
33 // Author: Anne-Katrin Emde <anne-katrin.emde@fu-berlin.de>
34 // ==========================================================================
35
36 #ifndef SEQAN_INCLUDE_SEQAN_GRAPH_ALGORITHM_REFINE_INEXACT_H_
37 #define SEQAN_INCLUDE_SEQAN_GRAPH_ALGORITHM_REFINE_INEXACT_H_
38
39 //SEQAN_NO_DDDOC: do not generate documentation for this file
40
41 namespace seqan {
42
43
44 struct TagInexactRefinement_;
45 typedef Tag<TagInexactRefinement_> const InexactRefinement;
46
47
48 ///inexact refinement (cuts that would produce segments shorter than min_len are not made)
49 template<typename TValue, typename TValue2, typename TSize>
50 inline bool
_cutIsValid(String<std::set<TValue>> & all_nodes,TValue2 seq_i_pos,TSize pos_i,typename std::set<TValue>::iterator iter,TSize min_len,Tag<TagInexactRefinement_> const)51 _cutIsValid(String<std::set<TValue> > & all_nodes,
52 TValue2 seq_i_pos,
53 TSize pos_i,
54 typename std::set<TValue>::iterator iter,
55 TSize min_len,
56 Tag<TagInexactRefinement_> const)
57 {
58
59 //cut already exists
60 if(iter != all_nodes[seq_i_pos].end())
61 return false;
62 typename std::set<TValue>::iterator tmp_iter = all_nodes[seq_i_pos].upper_bound(pos_i);
63 if(tmp_iter != all_nodes[seq_i_pos].end())
64 if((*tmp_iter - pos_i) < min_len)
65 return false;
66 if(tmp_iter != all_nodes[seq_i_pos].begin())
67 {
68 --tmp_iter;
69 if((pos_i - *tmp_iter) < min_len)
70 return false;
71 }
72 return true;
73 }
74
75
76 // returns node begin or end position closest to pos
77 // i.e. closest refined position
78 template<typename TAliGraph, typename TVertexDescriptor, typename TId, typename TPosition>
79 TPosition
_getClosestRefinedNeighbor(TAliGraph & ali_g,TVertexDescriptor & vd,TId,TPosition pos)80 _getClosestRefinedNeighbor(TAliGraph & ali_g,
81 TVertexDescriptor & vd,
82 TId /*seq*/,
83 TPosition pos)
84 {
85 if(pos-fragmentBegin(ali_g,vd) < fragmentBegin(ali_g,vd)+fragmentLength(ali_g,vd)-pos)
86 return fragmentBegin(ali_g,vd);
87 else
88 return fragmentBegin(ali_g,vd) + fragmentLength(ali_g,vd);
89 }
90
91
92 // get closest refined position to end position of fragment (cut_end_pos)
93 // and corresponding node (end_knot)
94 template<typename TAliGraph, typename TId, typename TPosition>
95 void
_getCutEndPos(TAliGraph & ali_g,typename VertexDescriptor<TAliGraph>::Type & end_knot,TId seq,TPosition act_begin_pos,TPosition end_pos,TPosition & cut_end_pos)96 _getCutEndPos(TAliGraph & ali_g,
97 typename VertexDescriptor<TAliGraph>::Type & end_knot,
98 TId seq,
99 TPosition act_begin_pos,
100 TPosition end_pos,
101 TPosition & cut_end_pos)
102 {
103 end_knot = findVertex(ali_g,seq,end_pos-1);//end_pos1 is the first position of the next node
104 if(end_pos == fragmentBegin(ali_g,end_knot) + fragmentBegin(ali_g,end_knot))
105 cut_end_pos = end_pos;
106 else
107 {
108 cut_end_pos = _getClosestRefinedNeighbor(ali_g,end_knot,seq,end_pos);
109 if(cut_end_pos <= act_begin_pos) end_knot = getNil<typename VertexDescriptor<TAliGraph>::Type>();
110 else
111 {
112 end_knot = findVertex(ali_g,seq,cut_end_pos-1);
113 SEQAN_ASSERT(cut_end_pos == fragmentBegin(ali_g,end_knot)+fragmentLength(ali_g,end_knot));
114 }
115 }
116 }
117
118
119 // get closest refined position to begin position of fragment (cut_act_pos)
120 // and corresponding node (act_knot)
121 template<typename TAliGraph, typename TId, typename TPosition>
122 void
_getCutBeginPos(TAliGraph & ali_g,typename VertexDescriptor<TAliGraph>::Type & act_knot,TId seq,TPosition act_end_pos,TPosition act_pos,TPosition & cut_act_pos)123 _getCutBeginPos(TAliGraph & ali_g,
124 typename VertexDescriptor<TAliGraph>::Type & act_knot,
125 TId seq,
126 TPosition act_end_pos,
127 TPosition act_pos,
128 TPosition & cut_act_pos)
129 {
130
131 act_knot = findVertex(ali_g,seq,act_pos);
132 //if completely refined
133 if(act_pos == fragmentBegin(ali_g,act_knot))
134 cut_act_pos = act_pos;
135 else //if incompletely refined
136 {
137 cut_act_pos = _getClosestRefinedNeighbor(ali_g,act_knot,seq,act_pos);
138 if(cut_act_pos > act_end_pos) act_knot = getNil<typename VertexDescriptor<TAliGraph>::Type>();
139 else
140 {
141 act_knot = findVertex(ali_g,seq,cut_act_pos); // have to watch out with cut_act_pos==seqLength!
142 SEQAN_ASSERT(act_knot == getNil<typename VertexDescriptor<TAliGraph>::Type>() ||
143 cut_act_pos == fragmentBegin(ali_g, act_knot));
144 }
145 }
146 }
147
148
149
150 //step 2 of constructing the refined alignment graph: add all edges
151 //version for inexact refinement
152 template<typename TAlignmentString,typename TPropertyMap,typename TStringSet,typename TSeqMap, typename TScore,typename TAliGraph>
153 void
_makeRefinedGraphEdges(TAlignmentString & alis,TPropertyMap &,TStringSet & seqs,TSeqMap & seq_map,TScore & score_type,TAliGraph & ali_g,Tag<TagInexactRefinement_> const)154 _makeRefinedGraphEdges(TAlignmentString & alis,
155 TPropertyMap & , //pm,
156 TStringSet & seqs,
157 TSeqMap & seq_map,
158 TScore & score_type,
159 TAliGraph & ali_g,
160 Tag<TagInexactRefinement_> const)
161 {
162 typedef typename Value<TAlignmentString>::Type TAlign;
163 typedef typename Position<TAlign>::Type TPosition;
164 typedef typename Id<TAlign>::Type TId;
165 typedef typename Iterator<TAlignmentString, Rooted>::Type TAliIterator;
166 typedef typename VertexDescriptor<TAliGraph>::Type TVertexDescriptor;
167 typedef typename EdgeDescriptor<TAliGraph>::Type TEdgeDescriptor;
168 //typedef typename Cargo<TAliGraph>::Type TCargo;
169
170 TVertexDescriptor nilVertex = getNil<TVertexDescriptor>();
171 //make edges
172 TAliIterator ali_it = begin(alis);
173 TAliIterator ali_end = end(alis);
174 //for each segment/fragment/alignment
175 while(ali_it != ali_end)
176 {
177 //get first sequence that takes part in the alignment + boundaries of the ali
178 TId seq1;
179 TPosition begin_pos1,end_pos1;
180 _getSeqBeginAndEnd(*ali_it,seq_map,seq1,begin_pos1,end_pos1,(TId)0);
181
182 //get second sequence that takes part in the alignment + boundaries of the ali
183 TId seq2;
184 TPosition begin_pos2,end_pos2;
185 _getSeqBeginAndEnd(*ali_it,seq_map,seq2,begin_pos2,end_pos2,(TId)1);
186
187 //get the last node that is within the current ali
188 TVertexDescriptor end_knot1;
189 TPosition cut_end_pos1;
190 _getCutEndPos(ali_g,end_knot1,seq1,begin_pos1,end_pos1,cut_end_pos1);
191 if(end_knot1 == nilVertex) // there is no node --> fragment disappeared in min_frag_len heuristic
192 continue;
193
194 //get the node that represents the current interval (begin_pos until next_cut_pos or end_pos)
195 TVertexDescriptor act_knot1;
196 TPosition cut_act_pos1,act_pos1;
197 act_pos1 = begin_pos1;
198 _getCutBeginPos(ali_g,act_knot1,seq1,end_pos1,act_pos1,cut_act_pos1);
199 if(act_knot1 == nilVertex) // there is no node, can this happen here?
200 continue;
201 TPosition act_end_pos1 = cut_act_pos1 + fragmentLength(ali_g,act_knot1);
202 //walk through cuts on the first sequence
203 // while (act_end_pos1 <= cut_end_pos1)
204 while (true)
205 {
206 //get other sequence and projected position
207 //TId seq2;
208 TPosition act_pos2;
209 _getOtherSequenceAndProject(*ali_it,0,seq_map,seq1,act_pos1,seq2,act_pos2);
210
211 //get node that corresponds to that position
212 TVertexDescriptor act_knot2;
213 TPosition cut_act_pos2;
214 _getCutBeginPos(ali_g,act_knot2,seq2,end_pos2,act_pos2,cut_act_pos2);
215 if(act_knot2 == nilVertex) // there is no corresponding node in second sequence
216 break;
217 //corresponding end on seq2 (there might be more than one node on seq2 that corresponds
218 //to the same interval (=node) on seq1)
219 TPosition act_end_pos2;
220 _getOtherSequenceAndProject(*ali_it,0,seq_map,seq1,_min(act_end_pos1,end_pos1)-1,seq2,act_end_pos2);
221 ++act_end_pos2;
222 TVertexDescriptor act_end_knot2;
223 TPosition cut_act_end_pos2;
224 _getCutEndPos(ali_g,act_end_knot2,seq2,begin_pos2,act_end_pos2,cut_act_end_pos2);
225 if(act_end_knot2 == nilVertex) // there is no node at all in second sequence
226 break;
227
228 if(cut_act_pos2 == cut_act_end_pos2)
229 break;
230 while(true)
231 {
232 //should at the moment return score for:
233 //
234 //seq1 = ....cr...rc....
235 // ||||||
236 //seq2 = ...c.r...rc....
237 //bzw
238 //seq1 = ..cr.....x.... man will aber nur ..cr......x....
239 // |||||||- ---||||||
240 //seq2 = ...r.c...rc... ...r.c...rc....
241 typename Value<TScore>::Type score = 0;
242 score = _getRefinedMatchScore(score_type,seqs,*ali_it,act_pos1,act_pos2,act_end_pos1-act_pos1,cut_act_end_pos2);
243 //score *= _getRefinedAnnoScore(ali_g,pm,act_knot1,act_knot2,score_type);
244 //add score for
245 //
246 //seq1 = ...-cr....x....
247 // ||
248 //seq2 = ...c.r...rc....
249 // score += getLeftRestScore(score_type,seqs,seq1,seq2,act_pos1,cut_act_pos1,act_pos2,cut_act_pos2);
250 if(score > 0)
251 { if(findEdge(ali_g,act_knot1,act_knot2)==0)
252 addEdge(ali_g,act_knot1,act_knot2,score);
253 else
254 {
255 TEdgeDescriptor ed = findEdge(ali_g, act_knot1, act_knot2);
256 //if((TCargo)score > getCargo(ed))
257 //assignCargo(ed, score);
258 assignCargo(ed, getCargo(ed)+score);
259 }
260 }
261 if(act_knot2==act_end_knot2)
262 break;
263 act_pos2 = cut_act_pos2 + fragmentLength(ali_g,act_knot2);
264 _getCutBeginPos(ali_g,act_knot2,seq2,end_pos2,act_pos2,cut_act_pos2);
265 }
266 if(act_knot1 == end_knot1)
267 break;
268 act_pos1 = act_end_pos1;
269 act_knot1 = findVertex(ali_g,seq1,act_pos1);
270 cut_act_pos1 = act_pos1;
271 act_end_pos1 = cut_act_pos1 + fragmentLength(ali_g,act_knot1);
272 }
273 ++ali_it;
274 }
275 }
276
277 // TODO(holtgrew): Documentation is incomplete.
278
279 /*!
280 * @fn matchRefinement
281 * @headerfile <seqan/graph_align.h>
282 * @brief Refines (i.e. cuts into smaller parts) a set of pairwise segment matches in such a way that none of the
283 * segments partly overlap. They are either identical (fully overlapping) or non-overlapping.
284 *
285 * @signature void matchRefinement(matches, stringSet[, scoringScheme], refinedGraph);
286 *
287 * @param[out] matches The set of matches. Types: Fragment, Align, Alignment Graph
288 * @param[out] refinedGraph The resulting refined set of matches stored in a graph. Types: Alignment Graph
289 * @param[out] stringSet The StringSet containing the sequences which the matches lie on. Types: StringSet
290 * @param[in] scoringScheme The scoring scheme used to score the refined matches (scores are attached to edges
291 * in the refined AlignmentGraph). If no scoring scheme is given, all edges get weight 1.
292 * Types: Score
293 */
294
295 //score type given, min fragment length given, if > 1 ==> inexact refinement
296 template<typename TAlignmentString, typename TScoreValue,typename TScoreSpec, typename TOutGraph, typename TSequence, typename TSetSpec>
297 void
matchRefinement(TAlignmentString & alis,StringSet<TSequence,TSetSpec> & seq,Score<TScoreValue,TScoreSpec> & score_type,TOutGraph & ali_graph,unsigned int min_frag_len)298 matchRefinement(TAlignmentString & alis,
299 StringSet<TSequence, TSetSpec> & seq,
300 Score<TScoreValue,TScoreSpec> & score_type,
301 TOutGraph & ali_graph,
302 unsigned int min_frag_len)
303 {
304 bool anno = false;
305 if(min_frag_len > 1)
306 matchRefinement(alis,seq,score_type,ali_graph,min_frag_len,anno,InexactRefinement());
307 else
308 matchRefinement(alis,seq,score_type,ali_graph,min_frag_len,anno,ExactRefinement());
309 }
310
311
312 //score type not given, min fragment length given, if > 1 ==> inexact refinement
313 template<typename TAlignmentString, typename TOutGraph, typename TSequence, typename TSetSpec>
314 void
matchRefinement(TAlignmentString & alis,StringSet<TSequence,TSetSpec> & seq,TOutGraph & ali_graph,unsigned int min_frag_len)315 matchRefinement(TAlignmentString & alis,
316 StringSet<TSequence, TSetSpec> & seq,
317 TOutGraph & ali_graph,
318 unsigned int min_frag_len)
319 {
320 // Score<int,FakeScore > fake_score;
321 typename Cargo<TOutGraph>::Type fake_score = 1;
322 bool anno = false;
323 if(min_frag_len > 1)
324 matchRefinement(alis,seq,fake_score,ali_graph,min_frag_len,anno,InexactRefinement());
325 else
326 matchRefinement(alis,seq,fake_score,ali_graph,min_frag_len,anno,ExactRefinement());
327 }
328
329 } // namespace seqan
330
331 #endif // #ifndef SEQAN_INCLUDE_SEQAN_GRAPH_ALGORITHM_REFINE_INEXACT_H_
332