1 // ==========================================================================
2 //                 SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 //     * Redistributions of source code must retain the above copyright
11 //       notice, this list of conditions and the following disclaimer.
12 //     * Redistributions in binary form must reproduce the above copyright
13 //       notice, this list of conditions and the following disclaimer in the
14 //       documentation and/or other materials provided with the distribution.
15 //     * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 //       its contributors may be used to endorse or promote products derived
17 //       from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: Tobias Rausch <rausch@embl.de>
33 // Author: Anne-Katrin Emde <anne-katrin.emde@fu-berlin.de>
34 // ==========================================================================
35 
36 #ifndef SEQAN_INCLUDE_SEQAN_GRAPH_ALGORITHM_REFINE_INEXACT_H_
37 #define SEQAN_INCLUDE_SEQAN_GRAPH_ALGORITHM_REFINE_INEXACT_H_
38 
39 //SEQAN_NO_DDDOC: do not generate documentation for this file
40 
41 namespace seqan {
42 
43 
44 struct TagInexactRefinement_;
45 typedef Tag<TagInexactRefinement_> const InexactRefinement;
46 
47 
48 ///inexact refinement (cuts that would produce segments shorter than min_len are not made)
49 template<typename TValue, typename TValue2, typename TSize>
50 inline bool
_cutIsValid(String<std::set<TValue>> & all_nodes,TValue2 seq_i_pos,TSize pos_i,typename std::set<TValue>::iterator iter,TSize min_len,Tag<TagInexactRefinement_> const)51 _cutIsValid(String<std::set<TValue> > & all_nodes,
52         TValue2 seq_i_pos,
53         TSize pos_i,
54         typename std::set<TValue>::iterator iter,
55         TSize min_len,
56         Tag<TagInexactRefinement_> const)
57 {
58 
59     //cut already exists
60     if(iter != all_nodes[seq_i_pos].end())
61         return false;
62     typename std::set<TValue>::iterator tmp_iter = all_nodes[seq_i_pos].upper_bound(pos_i);
63     if(tmp_iter != all_nodes[seq_i_pos].end())
64         if((*tmp_iter - pos_i) < min_len)
65             return false;
66     if(tmp_iter != all_nodes[seq_i_pos].begin())
67     {
68         --tmp_iter;
69         if((pos_i - *tmp_iter) < min_len)
70             return false;
71     }
72     return true;
73 }
74 
75 
76 // returns node begin or end position closest to pos
77 // i.e. closest refined position
78 template<typename TAliGraph, typename TVertexDescriptor, typename TId, typename TPosition>
79 TPosition
_getClosestRefinedNeighbor(TAliGraph & ali_g,TVertexDescriptor & vd,TId,TPosition pos)80 _getClosestRefinedNeighbor(TAliGraph & ali_g,
81                            TVertexDescriptor & vd,
82                            TId /*seq*/,
83                            TPosition pos)
84 {
85     if(pos-fragmentBegin(ali_g,vd) < fragmentBegin(ali_g,vd)+fragmentLength(ali_g,vd)-pos)
86         return fragmentBegin(ali_g,vd);
87     else
88         return fragmentBegin(ali_g,vd) + fragmentLength(ali_g,vd);
89 }
90 
91 
92 // get closest refined position to end position of fragment (cut_end_pos)
93 // and corresponding node (end_knot)
94 template<typename TAliGraph, typename TId, typename TPosition>
95 void
_getCutEndPos(TAliGraph & ali_g,typename VertexDescriptor<TAliGraph>::Type & end_knot,TId seq,TPosition act_begin_pos,TPosition end_pos,TPosition & cut_end_pos)96 _getCutEndPos(TAliGraph & ali_g,
97               typename VertexDescriptor<TAliGraph>::Type & end_knot,
98               TId seq,
99               TPosition act_begin_pos,
100               TPosition end_pos,
101               TPosition & cut_end_pos)
102 {
103     end_knot = findVertex(ali_g,seq,end_pos-1);//end_pos1 is the first position of the next node
104     if(end_pos == fragmentBegin(ali_g,end_knot) + fragmentBegin(ali_g,end_knot))
105         cut_end_pos = end_pos;
106     else
107     {
108         cut_end_pos = _getClosestRefinedNeighbor(ali_g,end_knot,seq,end_pos);
109         if(cut_end_pos <= act_begin_pos) end_knot = getNil<typename VertexDescriptor<TAliGraph>::Type>();
110         else
111         {
112             end_knot =  findVertex(ali_g,seq,cut_end_pos-1);
113             SEQAN_ASSERT(cut_end_pos == fragmentBegin(ali_g,end_knot)+fragmentLength(ali_g,end_knot));
114         }
115     }
116 }
117 
118 
119 // get closest refined position to begin position of fragment (cut_act_pos)
120 // and corresponding node (act_knot)
121 template<typename TAliGraph, typename TId, typename TPosition>
122 void
_getCutBeginPos(TAliGraph & ali_g,typename VertexDescriptor<TAliGraph>::Type & act_knot,TId seq,TPosition act_end_pos,TPosition act_pos,TPosition & cut_act_pos)123 _getCutBeginPos(TAliGraph & ali_g,
124               typename VertexDescriptor<TAliGraph>::Type & act_knot,
125               TId seq,
126               TPosition act_end_pos,
127               TPosition act_pos,
128               TPosition & cut_act_pos)
129 {
130 
131     act_knot = findVertex(ali_g,seq,act_pos);
132     //if completely refined
133     if(act_pos == fragmentBegin(ali_g,act_knot))
134         cut_act_pos = act_pos;
135     else //if incompletely refined
136     {
137         cut_act_pos = _getClosestRefinedNeighbor(ali_g,act_knot,seq,act_pos);
138         if(cut_act_pos > act_end_pos) act_knot = getNil<typename VertexDescriptor<TAliGraph>::Type>();
139         else
140         {
141             act_knot =  findVertex(ali_g,seq,cut_act_pos); // have to watch out with cut_act_pos==seqLength!
142             SEQAN_ASSERT(act_knot == getNil<typename VertexDescriptor<TAliGraph>::Type>() ||
143                          cut_act_pos == fragmentBegin(ali_g, act_knot));
144         }
145     }
146 }
147 
148 
149 
150 //step 2 of constructing the refined alignment graph: add all edges
151 //version for inexact refinement
152 template<typename TAlignmentString,typename TPropertyMap,typename TStringSet,typename TSeqMap, typename TScore,typename TAliGraph>
153 void
_makeRefinedGraphEdges(TAlignmentString & alis,TPropertyMap &,TStringSet & seqs,TSeqMap & seq_map,TScore & score_type,TAliGraph & ali_g,Tag<TagInexactRefinement_> const)154 _makeRefinedGraphEdges(TAlignmentString & alis,
155                        TPropertyMap & , //pm,
156                       TStringSet & seqs,
157                       TSeqMap & seq_map,
158                       TScore & score_type,
159                       TAliGraph & ali_g,
160                       Tag<TagInexactRefinement_> const)
161 {
162     typedef typename Value<TAlignmentString>::Type TAlign;
163     typedef typename Position<TAlign>::Type TPosition;
164     typedef typename Id<TAlign>::Type TId;
165     typedef typename Iterator<TAlignmentString, Rooted>::Type TAliIterator;
166     typedef typename VertexDescriptor<TAliGraph>::Type TVertexDescriptor;
167     typedef typename EdgeDescriptor<TAliGraph>::Type TEdgeDescriptor;
168     //typedef typename Cargo<TAliGraph>::Type TCargo;
169 
170         TVertexDescriptor nilVertex = getNil<TVertexDescriptor>();
171     //make edges
172     TAliIterator ali_it = begin(alis);
173     TAliIterator ali_end = end(alis);
174     //for each segment/fragment/alignment
175     while(ali_it != ali_end)
176     {
177         //get first sequence that takes part in the alignment + boundaries of the ali
178         TId seq1;
179         TPosition begin_pos1,end_pos1;
180         _getSeqBeginAndEnd(*ali_it,seq_map,seq1,begin_pos1,end_pos1,(TId)0);
181 
182         //get second sequence that takes part in the alignment + boundaries of the ali
183         TId seq2;
184         TPosition begin_pos2,end_pos2;
185         _getSeqBeginAndEnd(*ali_it,seq_map,seq2,begin_pos2,end_pos2,(TId)1);
186 
187         //get the last node that is within the current ali
188         TVertexDescriptor end_knot1;
189         TPosition cut_end_pos1;
190         _getCutEndPos(ali_g,end_knot1,seq1,begin_pos1,end_pos1,cut_end_pos1);
191         if(end_knot1 == nilVertex) // there is no node --> fragment disappeared in min_frag_len heuristic
192             continue;
193 
194         //get the node that represents the current interval (begin_pos until next_cut_pos or end_pos)
195         TVertexDescriptor act_knot1;
196         TPosition cut_act_pos1,act_pos1;
197         act_pos1 = begin_pos1;
198         _getCutBeginPos(ali_g,act_knot1,seq1,end_pos1,act_pos1,cut_act_pos1);
199         if(act_knot1 == nilVertex) // there is no node, can this happen here?
200             continue;
201         TPosition act_end_pos1 = cut_act_pos1 + fragmentLength(ali_g,act_knot1);
202         //walk through cuts on the first sequence
203 //        while (act_end_pos1 <= cut_end_pos1)
204         while (true)
205         {
206             //get other sequence and projected position
207             //TId seq2;
208             TPosition act_pos2;
209             _getOtherSequenceAndProject(*ali_it,0,seq_map,seq1,act_pos1,seq2,act_pos2);
210 
211             //get node that corresponds to that position
212             TVertexDescriptor act_knot2;
213             TPosition cut_act_pos2;
214             _getCutBeginPos(ali_g,act_knot2,seq2,end_pos2,act_pos2,cut_act_pos2);
215             if(act_knot2 == nilVertex) // there is no corresponding node in second sequence
216                 break;
217             //corresponding end on seq2 (there might be more than one node on seq2 that corresponds
218             //to the same interval (=node) on seq1)
219             TPosition act_end_pos2;
220              _getOtherSequenceAndProject(*ali_it,0,seq_map,seq1,_min(act_end_pos1,end_pos1)-1,seq2,act_end_pos2);
221             ++act_end_pos2;
222             TVertexDescriptor act_end_knot2;
223             TPosition cut_act_end_pos2;
224             _getCutEndPos(ali_g,act_end_knot2,seq2,begin_pos2,act_end_pos2,cut_act_end_pos2);
225             if(act_end_knot2 == nilVertex) // there is no node at all in second sequence
226                 break;
227 
228             if(cut_act_pos2 == cut_act_end_pos2)
229                 break;
230             while(true)
231             {
232                 //should at the moment return score for:
233                 //
234                 //seq1 = ....cr...rc....
235                 //            ||||||
236                 //seq2 = ...c.r...rc....
237                 //bzw
238                 //seq1 = ..cr.....x....   man will aber nur    ..cr......x....
239                 //          |||||||-                             ---||||||
240                 //seq2 = ...r.c...rc...                        ...r.c...rc....
241                 typename Value<TScore>::Type score = 0;
242                 score = _getRefinedMatchScore(score_type,seqs,*ali_it,act_pos1,act_pos2,act_end_pos1-act_pos1,cut_act_end_pos2);
243                 //score *= _getRefinedAnnoScore(ali_g,pm,act_knot1,act_knot2,score_type);
244                 //add score for
245                 //
246                 //seq1 = ...-cr....x....
247                 //          ||
248                 //seq2 = ...c.r...rc....
249 //                    score += getLeftRestScore(score_type,seqs,seq1,seq2,act_pos1,cut_act_pos1,act_pos2,cut_act_pos2);
250                 if(score > 0)
251                 {    if(findEdge(ali_g,act_knot1,act_knot2)==0)
252                         addEdge(ali_g,act_knot1,act_knot2,score);
253                     else
254                     {
255                         TEdgeDescriptor ed = findEdge(ali_g, act_knot1, act_knot2);
256                         //if((TCargo)score > getCargo(ed))
257                             //assignCargo(ed, score);
258                         assignCargo(ed, getCargo(ed)+score);
259                     }
260                 }
261                 if(act_knot2==act_end_knot2)
262                     break;
263                 act_pos2 = cut_act_pos2 + fragmentLength(ali_g,act_knot2);
264                 _getCutBeginPos(ali_g,act_knot2,seq2,end_pos2,act_pos2,cut_act_pos2);
265             }
266             if(act_knot1 == end_knot1)
267                 break;
268             act_pos1 = act_end_pos1;
269             act_knot1 = findVertex(ali_g,seq1,act_pos1);
270             cut_act_pos1 = act_pos1;
271             act_end_pos1 = cut_act_pos1 + fragmentLength(ali_g,act_knot1);
272         }
273         ++ali_it;
274     }
275 }
276 
277 // TODO(holtgrew): Documentation is incomplete.
278 
279 /*!
280  * @fn matchRefinement
281  * @headerfile <seqan/graph_align.h>
282  * @brief Refines (i.e. cuts into smaller parts) a set of pairwise segment matches in such a way that none of the
283  *        segments partly overlap. They are either identical (fully overlapping) or non-overlapping.
284  *
285  * @signature void matchRefinement(matches, stringSet[, scoringScheme], refinedGraph);
286  *
287  * @param[out] matches       The set of matches. Types: Fragment, Align, Alignment Graph
288  * @param[out] refinedGraph  The resulting refined set of matches stored in a graph.  Types: Alignment Graph
289  * @param[out] stringSet     The StringSet containing the sequences which the matches lie on. Types: StringSet
290  * @param[in]  scoringScheme The scoring scheme used to score the refined matches (scores are attached to edges
291  *                           in the refined AlignmentGraph).  If no scoring scheme is given, all edges get weight 1.
292  *                           Types: Score
293  */
294 
295 //score type given, min fragment length given, if > 1 ==> inexact refinement
296 template<typename TAlignmentString, typename TScoreValue,typename TScoreSpec, typename TOutGraph, typename TSequence, typename TSetSpec>
297 void
matchRefinement(TAlignmentString & alis,StringSet<TSequence,TSetSpec> & seq,Score<TScoreValue,TScoreSpec> & score_type,TOutGraph & ali_graph,unsigned int min_frag_len)298 matchRefinement(TAlignmentString & alis,
299                 StringSet<TSequence, TSetSpec> & seq,
300                 Score<TScoreValue,TScoreSpec> & score_type,
301                 TOutGraph & ali_graph,
302                 unsigned int min_frag_len)
303 {
304     bool anno = false;
305     if(min_frag_len > 1)
306         matchRefinement(alis,seq,score_type,ali_graph,min_frag_len,anno,InexactRefinement());
307     else
308         matchRefinement(alis,seq,score_type,ali_graph,min_frag_len,anno,ExactRefinement());
309 }
310 
311 
312 //score type not given, min fragment length given, if > 1 ==> inexact refinement
313 template<typename TAlignmentString, typename TOutGraph, typename TSequence, typename TSetSpec>
314 void
matchRefinement(TAlignmentString & alis,StringSet<TSequence,TSetSpec> & seq,TOutGraph & ali_graph,unsigned int min_frag_len)315 matchRefinement(TAlignmentString & alis,
316                 StringSet<TSequence, TSetSpec> & seq,
317                 TOutGraph & ali_graph,
318                 unsigned int min_frag_len)
319 {
320 //    Score<int,FakeScore > fake_score;
321     typename Cargo<TOutGraph>::Type fake_score = 1;
322     bool anno = false;
323     if(min_frag_len > 1)
324         matchRefinement(alis,seq,fake_score,ali_graph,min_frag_len,anno,InexactRefinement());
325     else
326         matchRefinement(alis,seq,fake_score,ali_graph,min_frag_len,anno,ExactRefinement());
327 }
328 
329 }  // namespace seqan
330 
331 #endif  // #ifndef SEQAN_INCLUDE_SEQAN_GRAPH_ALGORITHM_REFINE_INEXACT_H_
332