1 // ==========================================================================
2 // SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 // * Redistributions of source code must retain the above copyright
11 // notice, this list of conditions and the following disclaimer.
12 // * Redistributions in binary form must reproduce the above copyright
13 // notice, this list of conditions and the following disclaimer in the
14 // documentation and/or other materials provided with the distribution.
15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 // its contributors may be used to endorse or promote products derived
17 // from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: Tobias Rausch <rausch@embl.de>
33 // Author: Anne-Katrin Emde <anne-katrin.emde@fu-berlin.de>
34 // ==========================================================================
35 // Iterative instead of recursive implementation of segment match refinement.
36 // This is required for some large inputs to circumvent stack overflows.
37 // ==========================================================================
38
39 #ifndef SEQAN_INCLUDE_SEQAN_GRAPH_ALGORITHM_REFINE_EXACT_ITERATIVE_H_
40 #define SEQAN_INCLUDE_SEQAN_GRAPH_ALGORITHM_REFINE_EXACT_ITERATIVE_H_
41
42 namespace seqan {
43
44 struct TagExactRefinement_;
45 typedef Tag<TagExactRefinement_> const ExactRefinement;
46
47 // exact method, every cut is made (unless it already exists)
48 template<typename TValue, typename TValue2, typename TSize>
49 inline bool
_cutIsValid(String<std::set<TValue>> & all_nodes,TValue2 seq_i_pos,TSize,typename std::set<TValue>::iterator iter,TSize,Tag<TagExactRefinement_> const)50 _cutIsValid(String<std::set<TValue> > & all_nodes,
51 TValue2 seq_i_pos,
52 TSize,
53 typename std::set<TValue>::iterator iter,
54 TSize,
55 Tag<TagExactRefinement_> const)
56 {
57 //cut already exists
58 if(iter != all_nodes[seq_i_pos].end())
59 return false;
60 return true;
61 }
62
63
64 // necessary for reversed fragments: projected position pos_j is shifted one to the left --> ++pos_j if fragement reversed
65 template<typename TSize, typename TSpec,typename TPos>
66 inline void
_updateCutPosition(Fragment<TSize,ExactReversableFragment<TSpec>> & f,TPos & pos_j)67 _updateCutPosition(Fragment<TSize, ExactReversableFragment<TSpec> > & f, TPos & pos_j)
68 {
69 if(f.reversed)
70 ++pos_j;
71 }
72 //template<typename TSize, typename TSpec,typename TPos>
73 //inline void
74 //_updateCutPosition(Fragment<TSize, ExactReversableFragment<TSpec> > const& f, TPos & pos_j)
75 //{
76 // if(f.reversed)
77 // ++pos_j;
78 //}
79
80 // for all other fragment types --> no shifting necessary
81 template<typename TFrag,typename TPos>
82 inline void
_updateCutPosition(TFrag &,TPos &)83 _updateCutPosition(TFrag &, TPos &)
84 {
85 return;
86 }
87
88 ///////////////////////////////////////////////////////////////////////////////////////////////////////
89 //Recursive Refinement
90 //refine position node_i on sequence seq_i
91 template<typename TValue, typename TAlignmentString, typename TStringSet,typename TGraph, typename TPropertyMap,typename TSeqMap, typename TTagSpec>
92 inline void
_refine(TValue node_i,TValue seq_i_id,TStringSet & seqs,TSeqMap & seq_map,TAlignmentString & alis,String<TGraph> & gs,String<TPropertyMap> & pms,String<std::set<TValue>> & all_nodes,TValue min_len,Tag<TTagSpec> tag)93 _refine(TValue node_i,
94 TValue seq_i_id,
95 TStringSet & seqs,
96 TSeqMap & seq_map,
97 TAlignmentString & alis,
98 String<TGraph> & gs,
99 String<TPropertyMap> & pms,
100 String<std::set<TValue> > & all_nodes,
101 TValue min_len,
102 Tag<TTagSpec> tag)
103 {
104 typedef typename Cargo<typename Value<TPropertyMap>::Type>::Type TAlignmentPointer;
105 typedef typename Iterator<String<TAlignmentPointer>, Rooted>::Type TSegmentIterator;
106 //find all segment matches that contain the current position (node_i)
107 String<TAlignmentPointer> relevant_segments;
108 TValue seq_i_pos = idToPosition(seqs,seq_i_id);
109 findIntervalsExcludeTouching(relevant_segments, gs[seq_i_pos],pms[seq_i_pos],node_i);
110
111
112 TSegmentIterator segment_it = begin(relevant_segments);
113 TSegmentIterator segment_end = end(relevant_segments);
114 //foreach of those segments
115 while(segment_it != segment_end)
116 {
117 TValue match_id = (*segment_it).i1; // segment match
118 TValue seg_num = (*segment_it).i2; // first or second segment in segment match?
119
120 //get the sequence that node_i needs to be projected onto (seq_j)
121 //and get the projected position (pos_j)
122 TValue seq_j_id, node_j;
123 _getOtherSequenceAndProject(alis[match_id],seg_num,seq_map,seq_i_id,node_i,seq_j_id,node_j);
124 TValue seq_j_pos = idToPosition(seqs,seq_j_id);
125 _updateCutPosition(alis[match_id],node_j);
126
127 typename std::set<TValue>::iterator iter;
128 iter = all_nodes[seq_j_pos].find(node_j);
129
130 //if node does not exist yet ---> insert and continue cutting
131 if(_cutIsValid(all_nodes,seq_j_pos,node_j,iter,min_len,tag))
132 {
133 all_nodes[seq_j_pos].insert(node_j);
134 _refine(node_j,seq_j_id,seqs,seq_map,alis,gs,pms,all_nodes,min_len,tag);
135 }
136 // TODO: else //verschmelzen, abschneiden und ergehen, erst sp�ter...
137 // do nothing or resolve problems
138
139 ++segment_it;
140 }
141 }
142
143 //template<typename TFragSize, typename TFragSpec>
144 //void
145 //printMatch(Fragment<TFragSize,TFragSpec> & f)
146 //{
147 // std::cout << "FRAGMENT:" << " f.len = "<< f.len <<std::endl;
148 // std::cout << "f.seqId1 = "<< f.seqId1 << " f.begin1 = " << f.begin1 << std::endl;
149 // std::cout << "f.seqId2 = "<< f.seqId2 << " f.begin2 = " << f.begin2 << std::endl;
150 //}
151
152 //template<typename TAlign>
153 //void
154 //printMatch(TAlign & f)
155 //{
156 // std::cout << f;
157 //}
158
159 ///////////////////////////////////////////////////////////////////////////////////////////////////////
160 // Construct interval trees
161 ////////////////////////////////////////////////////////////////////////////////////////////////////
162
163
164 //construct intervals from allignments for each sequence (other Alignment types)
165 template<typename TInterval, typename TStringSet, typename TAlignmentString, typename TSeqMap>
166 void
_buildIntervalsForAllSequences(TAlignmentString & alis,String<String<TInterval>> & intervals,TStringSet & seqs,TSeqMap & seq_map)167 _buildIntervalsForAllSequences(TAlignmentString & alis,
168 String<String<TInterval> > & intervals,
169 TStringSet & seqs,
170 TSeqMap & seq_map)
171 {
172
173 typedef typename Value<TInterval>::Type TValue;
174 typedef typename Cargo<TInterval>::Type TCargo;
175 typedef typename Iterator<TAlignmentString,Standard>::Type TAliIterator;
176 TAliIterator ali_it = begin(alis,Standard());
177 TAliIterator ali_end = end(alis,Standard());
178 TValue ali_counter = 0;
179 //foreach alignment
180 while(ali_it != ali_end)
181 {
182 TValue seq_i_id,begin_,end_;
183 //printMatch(*ali_it);
184 //get the first sequence (and its begin and end) that takes part in the alignment (seq_i)
185 _getSeqBeginAndEnd(*ali_it,seq_map,seq_i_id,begin_,end_,0);
186 TValue seq_i_pos = idToPosition(seqs, seq_i_id);
187 //and append the interval (ali_begin, ali_end) with cargo ali* to the list of intervals of seq_i
188 appendValue(intervals[seq_i_pos],IntervalAndCargo<TValue,TCargo>(begin_,end_,TCargo(ali_counter,0)));
189
190 //get the second sequence (and its begin and end) that takes part in the alignment (seq_i)
191 _getSeqBeginAndEnd(*ali_it,seq_map,seq_i_id,begin_,end_,1);
192 seq_i_pos = idToPosition(seqs, seq_i_id);
193 //and again append the interval (ali_begin, ali_end) with cargo ali* to the list of intervals of seq_i
194 appendValue(intervals[seq_i_pos],IntervalAndCargo<TValue,TCargo>(begin_,end_,TCargo(ali_counter,1)));
195
196 ++ali_counter;
197 ++ali_it;
198 }
199
200 }
201
202
203 //get all intervals from the alignments and construct an interval tree for each sequence
204 template<typename TGraph, typename TPropertyMap, typename TAlignmentString, typename TSequence, typename TSetSpec, typename TValue, typename TSeqMap>
205 void
_createTreesForAllSequences(String<TGraph> & gs,String<TPropertyMap> & pms,TAlignmentString & alis,StringSet<TSequence,TSetSpec> & seqs,TSeqMap & seq_map,TValue numSequences)206 _createTreesForAllSequences(String<TGraph> & gs,
207 String<TPropertyMap> & pms,
208 TAlignmentString & alis,
209 StringSet<TSequence,TSetSpec> & seqs,
210 TSeqMap & seq_map,
211 TValue numSequences)
212 {
213 //typedef typename Value<TAlignmentString>::Type TAlignment;
214 // typedef TAlignment* TCargo;
215 typedef Pair<unsigned,unsigned,BitPacked<31,1> > TCargo;
216 typedef IntervalAndCargo<int,TCargo> TInterval;
217 //typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
218
219 //std::cout <<"create interval trees...";
220 // clock_t start, finish1;
221 // double duration;
222 // start = clock();
223 //one tree for each sequence
224 resize(gs,numSequences);
225 resize(pms,numSequences);
226
227 // and one string of intervals for each sequence
228 String<String<TInterval> > intervals;
229 resize(intervals,numSequences);
230 // fill intervals
231 _buildIntervalsForAllSequences(alis,intervals,seqs,seq_map);
232
233 TValue i = 0;
234
235 while(i < numSequences)
236 {
237 //std::cout << (numSequences-i) <<" more ("<<length(intervals[i])<<" intervals)... "<<std::flush;
238 TValue center = length(seqs[i])/2; // center raus, hat hier nix zu suchen
239 //create interval tree!
240 createIntervalTree(gs[i], pms[i], intervals[i], center);
241
242 //intervals for sequence i are not needed anymore
243 clear(intervals[i]);
244 ++i;
245 }
246 // finish1 = clock();
247 // duration = (double)(finish1 - start) / CLOCKS_PER_SEC;
248 // std::cout << "\ntook " << duration << " seconds.\n";
249 }
250
251
252 ///////////////////////////////////////////////////////////////////////////////////////////////////////
253 // Construct refined alignment graph
254 ////////////////////////////////////////////////////////////////////////////////////////////////////
255
256 // step 1 of constructing the refined alignment graph: create all nodes
257 template<typename TStringSet,typename TValue,typename TAliGraph>
258 void
_makeRefinedGraphNodes(String<std::set<TValue>> & all_nodes,TStringSet & seqs,TAliGraph & ali_g)259 _makeRefinedGraphNodes(String<std::set<TValue> > & all_nodes,
260 TStringSet & seqs,
261 TAliGraph & ali_g)
262 {
263 typedef typename std::set<TValue>::iterator TSetIterator;
264 //for each sequence look at all cut positions and create nodes between them
265 for(unsigned int seq_i_pos = 0; seq_i_pos < length(seqs); ++seq_i_pos)
266 {
267 TValue seq_i_id = positionToId(stringSet(ali_g), seq_i_pos);
268 TSetIterator it = all_nodes[seq_i_pos].begin();
269 TSetIterator end_it = all_nodes[seq_i_pos].end();
270 TSetIterator next_it = it;
271 if(next_it != end_it)
272 ++next_it;
273 else
274 addVertex(ali_g, seq_i_id, 0, length(seqs[seq_i_pos]));
275
276 //first unaligned node
277 if(it != end_it && *it != 0)
278 addVertex(ali_g, seq_i_id, 0, *it);
279 //a new node for each interval
280 while(next_it != end_it)
281 {
282 TValue pos_i = *it;
283 addVertex(ali_g, seq_i_id, pos_i, *next_it - pos_i);
284 ++it;
285 ++next_it;
286 }
287 //last unaligned node
288 if(it !=end_it && *it<length(seqs[seq_i_pos]))
289 addVertex(ali_g, seq_i_id, *it, (length(seqs[seq_i_pos])) - *it);
290 all_nodes[seq_i_pos].clear();
291 }
292 }
293
294
295 // step 2 of constructing the refined alignment graph: add all edges
296 // version for exact refinement
297 template<typename TAlignmentString,typename TStringSet,typename TSeqMap, typename TPropertyMap,typename TScore,typename TAliGraph >
298 void
_makeRefinedGraphEdges(TAlignmentString & alis,TPropertyMap & pm,TStringSet & seqs,TSeqMap & seq_map,TScore & score_type,TAliGraph & ali_g,Tag<TagExactRefinement_> const)299 _makeRefinedGraphEdges(TAlignmentString & alis,
300 TPropertyMap & pm,
301 TStringSet & seqs,
302 TSeqMap & seq_map,
303 TScore & score_type,
304 TAliGraph & ali_g,
305 Tag<TagExactRefinement_> const)
306 {
307 typedef typename Value<TAlignmentString>::Type TAlign;
308 typedef typename Size<TAlign>::Type TValue;
309 typedef typename Iterator<TAlignmentString, Rooted>::Type TAliIterator;
310 typedef typename VertexDescriptor<TAliGraph>::Type TVertexDescriptor;
311 typedef typename EdgeDescriptor<TAliGraph>::Type TEdgeDescriptor;
312 typedef typename Cargo<TAliGraph>::Type TCargo;
313 //make edges
314 TAliIterator ali_it = begin(alis);
315 TAliIterator ali_end = end(alis);
316 //for each segment/fragment/alignment
317 while(ali_it != ali_end)
318 {
319 //get sequence, begin position and end position
320 TValue seq_id,begin_pos,end_pos;
321 _getSeqBeginAndEnd(*ali_it,seq_map,seq_id,begin_pos,end_pos,(TValue)0);
322 SEQAN_ASSERT_LEQ(end_pos, length(seqs[idToPosition(seqs, seq_id)]));
323 SEQAN_ASSERT(ali_it.data_container == ali_end.data_container);
324 SEQAN_ASSERT(ali_it.data_iterator != ali_end.data_iterator);
325
326 //get the node represents the current interval (begin_pos until next_cut_pos or end_pos)
327 TVertexDescriptor act_knot = findVertex(ali_g,seq_id,begin_pos);
328 TValue act_pos = begin_pos;
329 TValue seq_j_id_temp,pos_j_begin;
330 _getOtherSequenceAndProject(*ali_it,(TValue)0,seq_map,seq_id,act_pos,seq_j_id_temp,pos_j_begin);
331
332 //for each interval that lies within the current segment/fragement/alignment
333 while(act_pos < end_pos)
334 {
335 //get other sequence and projected position
336 TValue seq_j_id,pos_j;
337 _getOtherSequenceAndProject(*ali_it,(TValue)0,seq_map,seq_id,act_pos,seq_j_id,pos_j);
338 SEQAN_ASSERT_NEQ(pos_j, static_cast<TValue>(-1));
339 //find node that contains the projected position (pos_j)
340 TVertexDescriptor vd = findVertex(ali_g, seq_j_id, pos_j);
341 bool doAddEdge = true;
342
343 // if(doAddEdge && fragmentBegin(ali_g,vd)!=pos_j) // check if edge makes sense
344 if (vd == getNil<TVertexDescriptor>())
345 doAddEdge = false;
346 else
347 {
348 TValue temp_seq_i_id,temp_act_pos;
349 _getOtherSequenceAndProject(*ali_it,(TValue)1,seq_map,seq_j_id,static_cast<TValue>(fragmentBegin(ali_g,vd)),temp_seq_i_id,temp_act_pos);
350 if(temp_act_pos == static_cast<TValue>(-1))
351 doAddEdge = false;
352 else
353 {
354 TVertexDescriptor temp_act_knot = findVertex(ali_g, temp_seq_i_id, temp_act_pos);
355 if(act_knot!=temp_act_knot)
356 doAddEdge = false;
357 }
358 }
359 if(doAddEdge)
360 {
361 typename Value<TScore>::Type score = _getRefinedMatchScore(score_type,seqs,*ali_it,act_pos,pos_j,fragmentLength(ali_g,act_knot),fragmentLength(ali_g,vd));//,fragmentLength(ali_g,vd));
362 // typename Value<TScore>::Type score = fragmentLength(ali_g,vd);
363 score *= _getRefinedAnnoScore(ali_g,pm,vd,act_knot,score_type);
364 //this needs to be generalized (makes sense for positive scores only)
365 if(score <= 0) score = 1;
366 if(score > 0)
367 {
368 if (findEdge(ali_g, act_knot, vd) == 0) {
369 //if(abs((double)fragmentLength(ali_g, act_knot) - (double)fragmentLength(ali_g, vd)) > 20) {
370 // std::cerr << "added edge: " << fragmentLength(ali_g, act_knot) << " " << fragmentLength(ali_g, vd) << std::endl;
371 // std::cerr << *ali_it;
372 // std::cerr << "act_pos=" << act_pos-begin_pos << " pos_j=" << pos_j-pos_j_begin << std::endl;
373 //} else {
374 addEdge(ali_g,act_knot,vd,(TCargo)score);
375 //}
376 }
377 else {
378 TEdgeDescriptor ed = findEdge(ali_g, act_knot, vd);
379 //if((TCargo)score > getCargo(ed))
380 //assignCargo(ed, score);
381 assignCargo(ed, getCargo(ed)+score);
382 }
383 }
384 }
385 //prepare for next interval
386 act_pos += fragmentLength(ali_g,act_knot);
387 act_knot = findVertex(ali_g,seq_id,act_pos);
388 }
389 ++ali_it;
390 }
391 }
392
393
394
395
396
397 //build refined alignment graph, nodes are numbered ascendingly:
398 //seq1 0 1 2 3 4
399 //seq2 5 6 7 8 9 10
400 //seq3 11 12 13 14 15
401 template<typename TValue,typename TAlignmentString,typename TScore,typename TSequence, typename TSetSpec,typename TAliGraph,typename TSeqMap,typename TTagSpec>
402 void
_makeAlignmentGraphFromRefinedSegments(String<std::set<TValue>> & all_nodes,TAlignmentString & alis,TScore & score_type,StringSet<TSequence,TSetSpec> & seqs,TSeqMap & seq_map,TAliGraph & ali_g,Tag<TTagSpec> const tag,bool)403 _makeAlignmentGraphFromRefinedSegments(String<std::set<TValue> > & all_nodes,
404 TAlignmentString & alis,
405 TScore & score_type,
406 StringSet<TSequence, TSetSpec> & seqs,
407 TSeqMap & seq_map,
408 TAliGraph & ali_g,
409 Tag<TTagSpec> const tag,
410 bool)
411 {
412 //std::cout << "making refined alignment graph...";
413 //clock_t start, finish1;
414 //double duration;
415 //start = clock();
416
417 //make nodes (same function for inexact and exact refinement)
418 _makeRefinedGraphNodes(all_nodes,seqs,ali_g);
419
420 bool pm = false;
421 //add edges (different functions depending on exact/inexact refinement)
422 _makeRefinedGraphEdges(alis,pm,seqs,seq_map,score_type,ali_g,tag);
423
424 //std::cout << "check\n";
425 //finish1 = clock();
426 //duration = (double)(finish1 - start) / CLOCKS_PER_SEC;
427 //std::cout << "\ntook " << duration << " seconds.\n";
428 }
429
430
431 //build refined alignment graph as above, but with additional annotation information
432 template<typename TValue,typename TAlignmentString,typename TScore,typename TSequence, typename TSetSpec,typename TAliGraph,typename TSeqMap,typename TAnnoString,typename TTagSpec>
433 void
_makeAlignmentGraphFromRefinedSegments(String<std::set<TValue>> & all_nodes,TAlignmentString & alis,TScore & score_type,StringSet<TSequence,TSetSpec> & seqs,TSeqMap & seq_map,TAliGraph & ali_g,Tag<TTagSpec> const tag,TAnnoString & annotation)434 _makeAlignmentGraphFromRefinedSegments(String<std::set<TValue> > & all_nodes,
435 TAlignmentString & alis,
436 TScore & score_type,
437 StringSet<TSequence, TSetSpec> & seqs,
438 TSeqMap & seq_map,
439 TAliGraph & ali_g,
440 Tag<TTagSpec> const tag,
441 TAnnoString & annotation)
442 {
443 //std::cout << "making refined alignment graph...";
444 //clock_t start, finish1;
445 //double duration;
446 //start = clock();
447
448 //make nodes (same function for inexact and exact refinement)
449 _makeRefinedGraphNodes(all_nodes,seqs,ali_g);
450
451 //add annotation to nodes
452 //typedef typename Value<TAnnoString>::Type TAnnotation;
453 //typedef typename Value<TAnnotation>::Type TLabel;
454 typedef char TLabel;
455 String<String<TLabel> > pm;
456 _addNodeAnnotation(seqs,seq_map,annotation,pm,ali_g,tag);
457
458 //add edges (different functions depending on exact/inexact refinement)
459 _makeRefinedGraphEdges(alis,pm,seqs,seq_map,score_type,ali_g,tag);
460
461 //std::cout << "check\n";
462 //finish1 = clock();
463 //duration = (double)(finish1 - start) / CLOCKS_PER_SEC;
464 //std::cout << "\ntook " << duration << " seconds.\n";
465 }
466
467
468
469
470
471 ////////////////////////////////////////////////////////////////////////////////////////
472 //The big matchRefinement function that does everything: build interval trees, do the
473 //refinement and construct a refined alignment graph
474 ////////////////////////////////////////////////////////////////////////////////////////
475 template<typename TAlignmentString, typename TAnnotation, typename TOutGraph, typename TSequence, typename TSetSpec, typename TScore,typename TTagSpec>
476 void
matchRefinement(TAlignmentString & alis,StringSet<TSequence,TSetSpec> & seq,TScore & score_type,TOutGraph & ali_graph,typename Size<typename Value<TAlignmentString>::Type>::Type min_fragment_len,TAnnotation & annotation,Tag<TTagSpec> const tag)477 matchRefinement(TAlignmentString & alis,
478 StringSet<TSequence, TSetSpec> & seq,
479 TScore & score_type,
480 TOutGraph & ali_graph,
481 typename Size<typename Value<TAlignmentString>::Type>::Type min_fragment_len,
482 TAnnotation & annotation,
483 Tag<TTagSpec> const tag)
484 {
485 ////////////////////////////////////////////////////////////////
486 //typedefs
487 typedef typename Value<TAlignmentString>::Type TAlign;
488 typedef typename Iterator<TAlignmentString, Rooted>::Type TAliIterator;
489 typedef typename Size<TAlign>::Type TValue;
490 // typedef TValue TCargo;
491 // typedef Pair<unsigned,unsigned,BitPacked<31,1> > TCargo;
492 typedef Pair<unsigned,unsigned,BitPacked<31,1> > TCargo;
493 typedef IntervalAndCargo<int,TCargo> TInterval;
494 typedef Graph<Directed<void,WithoutEdgeId> > TGraph;
495 typedef IntervalTreeNode<TInterval> TNode;
496 typedef String<TNode> TPropertyMap;
497 typedef typename std::set<TValue>::iterator TSetIterator;
498 typedef typename Cargo<typename Value<TPropertyMap>::Type>::Type TAlignmentPointer;
499 typedef typename Iterator<String<TAlignmentPointer>, Rooted>::Type TSegmentIterator;
500
501 #ifdef SEQAN_TCOFFEE_DEBUG
502 double refinementTime = sysTime();
503 #endif
504
505 ////////////////////////////////////////////////////////////////
506 TValue numSequences = length(seq);
507 //weird ID --> good ID map
508 std::map<const void * ,int> seq_map;
509 for(int i = 0; i < (int) numSequences; ++i)
510 seq_map[getObjectId(seq[i])] = i;
511 ////////////////////////////////////////////////////////////////
512 //build interval trees
513 String<TGraph> gs;
514 String<TPropertyMap> pms;
515 _createTreesForAllSequences(gs, pms, alis, seq, seq_map, numSequences);
516
517 ////////////////////////////////////////////////////////////////
518 //do refinement
519 //std::cout <<"refining..."<<std::flush;
520 // clock_t start, finish1;
521 // double duration;
522 // start = clock();
523
524 //all_nodes = set of all cut positions
525 String<std::set<TValue> > all_nodes;
526 resize(all_nodes,numSequences);
527
528 //all_nodes that need to be processed set of all cut positions
529 String<std::set<TValue> > all_node_queues;
530 resize(all_node_queues,numSequences);
531
532 //call function _refine for each startknoten
533 TAliIterator ali_it = begin(alis);
534 TAliIterator ali_end = end(alis);
535 //for each segment/fragement/alignment
536 while(ali_it != ali_end)
537 {
538 //for each of the two sequences
539 for(TValue i = 0; i < 2; ++i)
540 {
541 TValue seq_i_id,begin_i,end_i;
542 _getSeqBeginAndEnd(*ali_it,seq_map,seq_i_id,begin_i,end_i,i);
543 TValue seq_i_pos = idToPosition(seq,seq_i_id);
544
545 all_node_queues[seq_i_pos].insert(begin_i);
546 all_node_queues[seq_i_pos].insert(end_i);
547 }
548 ++ali_it;
549 }
550
551
552 TSetIterator queueIt;
553 bool done = false;
554 while(!done)
555 {
556 for(unsigned seq_i_pos = 0; seq_i_pos < numSequences; ++seq_i_pos)
557 {
558 queueIt = all_node_queues[seq_i_pos].begin();
559 while (queueIt != all_node_queues[seq_i_pos].end())
560 {
561 TValue node_i = *queueIt;
562 TSetIterator iter = all_nodes[seq_i_pos].find(node_i);
563 // TSetIterator qiter = all_node_queues[seq_i_pos].find(node_i);
564 if(_cutIsValid(all_nodes,seq_i_pos,node_i,iter,min_fragment_len,tag))
565 //&& _cutIsValid(all_node_queues,seq_i_pos,node_i,qiter,min_fragment_len,tag))
566 // if(iter == all_nodes[seq_i_pos].end())
567 {
568 TValue seq_i_id = positionToId(seq, seq_i_pos);
569 all_nodes[seq_i_pos].insert(node_i);
570 String<TAlignmentPointer> relevant_segments;
571 findIntervalsExcludeTouching(relevant_segments, gs[seq_i_pos],pms[seq_i_pos],node_i);
572
573 TSegmentIterator segment_it = begin(relevant_segments);
574 TSegmentIterator segment_end = end(relevant_segments);
575 //foreach of those segments
576 while(segment_it != segment_end)
577 {
578 TValue match_id = (*segment_it).i1;
579 TValue seg_num = (*segment_it).i2; //get the sequence that node_i needs to be projected onto (seq_j)
580 //and get the projected position (pos_j)
581 TValue seq_j_id, node_j;
582 _getOtherSequenceAndProject(alis[match_id],seg_num,seq_map,seq_i_id,node_i,seq_j_id,node_j);
583 TValue seq_j_pos = idToPosition(seq,seq_j_id);
584 _updateCutPosition(alis[match_id],node_j);
585
586 typename std::set<TValue>::iterator iter_j, qiter_j;
587 iter_j = all_nodes[seq_j_pos].find(node_j);
588 qiter_j = all_node_queues[seq_j_pos].find(node_j);
589
590 //if node does not exist yet ---> insert and continue cutting
591 if(_cutIsValid(all_nodes,seq_j_pos,node_j,iter_j,min_fragment_len,tag)
592 && _cutIsValid(all_node_queues,seq_j_pos,node_j,qiter_j,min_fragment_len,tag))
593 //if(iter_j == all_nodes[seq_j_pos].end())
594 {
595 all_node_queues[seq_j_pos].insert(node_j);
596 }
597
598 ++segment_it;
599 }
600
601 }
602 ++queueIt;
603 }
604 all_node_queues[seq_i_pos].clear();
605 }
606 unsigned i;
607 for(i = 0; i < numSequences; ++i)
608 {
609 queueIt = all_node_queues[i].begin();
610 if (queueIt != all_node_queues[i].end())
611 break;
612 }
613 if(i==numSequences)
614 done=true;
615 }
616 _addAnnotationCuts(all_nodes,alis,gs,pms,seq,seq_map,annotation,min_fragment_len,tag);
617
618 // finish1 = clock();
619 // duration = (double)(finish1 - start) / CLOCKS_PER_SEC;
620 //std::cout << "\ntook " << duration << " seconds.\n";
621 //for(int seq_i = 0; seq_i < length(seq); ++seq_i)
622 //{
623 // typename std::set<TValue>::iterator it = all_nodes[seq_i].begin();
624 // typename std::set<TValue>::iterator end_it = all_nodes[seq_i].end();
625 //
626 // while(it != end_it)
627 // {
628 // std::cout << *it << ",";
629 // ++it;
630 // }
631 // std::cout << "\n";
632 //}
633 //std::cout <<"building tree..."<<std::flush;
634
635 #ifdef SEQAN_TCOFFEE_DEBUG
636 std::cout << std::setw(30) << std::left << "Segment-match refinement:" << std::setw(10) << std::right << sysTime() - refinementTime << " s" << std::endl;
637
638 double buildGraphTime = sysTime();
639 #endif
640
641 ////////////////////////////////////////////////////////////////
642 //build refined alignment graph
643 _makeAlignmentGraphFromRefinedSegments(all_nodes,alis,score_type,seq,seq_map,ali_graph,tag,annotation);
644
645 #ifdef SEQAN_TCOFFEE_DEBUG
646 std::cout << std::setw(30) << std::left << "Build alignment graph:" << std::setw(10) << std::right << sysTime() - buildGraphTime << " s" << std::endl;
647 #endif
648
649 }
650
651
652 ///////WRAPPERS
653
654 //exact refinement, score type given
655 template<typename TAlignmentString, typename TScoreValue,typename TScoreSpec,typename TOutGraph, typename TSequence, typename TSetSpec>
656 void
matchRefinement(TAlignmentString & alis,StringSet<TSequence,TSetSpec> & seq,Score<TScoreValue,TScoreSpec> & score_type,TOutGraph & ali_graph)657 matchRefinement(TAlignmentString & alis,
658 StringSet<TSequence, TSetSpec> & seq,
659 Score<TScoreValue,TScoreSpec> & score_type,
660 TOutGraph & ali_graph)
661 {
662 //min_fragment_len = 1 ==> Exact cutting
663 bool anno = false;
664 matchRefinement(alis,seq,score_type,ali_graph,1,anno,ExactRefinement());
665 }
666
667
668
669 //exact refinement, score type not given
670 template<typename TFragmentString, typename TOutGraph, typename TSequence, typename TSetSpec>
671 void
matchRefinement(TFragmentString & matches,StringSet<TSequence,TSetSpec> & strSet,TOutGraph & ali_graph)672 matchRefinement(TFragmentString & matches,
673 StringSet<TSequence, TSetSpec> & strSet,
674 TOutGraph & ali_graph)
675 {
676 typename Cargo<TOutGraph>::Type fake_score = 1;
677 bool anno = false;
678 matchRefinement(matches,strSet,fake_score,ali_graph,1,anno,ExactRefinement());
679 }
680
681 } // namespace seqan
682
683 #endif // #ifndef SEQAN_INCLUDE_SEQAN_GRAPH_ALGORITHM_REFINE_EXACT_ITERATIVE_H_
684