1 // ==========================================================================
2 // SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2010, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 // * Redistributions of source code must retain the above copyright
11 // notice, this list of conditions and the following disclaimer.
12 // * Redistributions in binary form must reproduce the above copyright
13 // notice, this list of conditions and the following disclaimer in the
14 // documentation and/or other materials provided with the distribution.
15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 // its contributors may be used to endorse or promote products derived
17 // from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32
33 #ifndef SEQAN_HEADER_FIND_MOTIF_PMS1_H
34 #define SEQAN_HEADER_FIND_MOTIF_PMS1_H
35
36 namespace SEQAN_NAMESPACE_MAIN
37 {
38
39 //////////////////////////////////////////////////////////////////////////////
40 // Pms1
41 //////////////////////////////////////////////////////////////////////////////
42
43 /**
44 .Spec.Pms1:
45 ..summary: Represents the Pms1 algorithm developed by Rajasekaran et al.
46 ..general:Class.MotifFinder
47 ..cat:Motif Search
48 ..signature:MotifFinder<TValue, Pms1>
49 ..param.TValue:The type of sequences to be analyzed.
50 ...type:Spec.Dna
51 ...type:Spec.AminoAcid
52 ..remarks:The exact @Spec.Pms1|Pms1 algorithm@ (Planted Motif Search 1) searches in the space
53 of possible motifs such as @Spec.EPatternBranching@. The procedure of the algorithm
54 is quite simple. For every l-mer in each input sequence the algorithm generates
55 all possible length-l patterns in the Hamming distance $d$-neighborhood of $x$.
56 The neighbor sets for each sequence are then intersected so that at the end of the process
57 we get a group of l-mers or a single l-mer that occur(s) in each input sequence with $d$
58 substitutions.
59 ..include:seqan/find_motif.h
60 */
61
62 ///.Class.MotifFinder.param.TSpec.type:Spec.Pms1
63
64 struct Pms1_;
65 typedef Tag<Pms1_> const Pms1;
66
67 //////////////////////////////////////////////////////////////////////////////
68 // MotifFinder - Pms1 Spec
69 //
70 // t:=dataset size (number of sequences)
71 // n:=average sequence size
72 // l:=motif size
73 // d:=number of substitutions
74 //////////////////////////////////////////////////////////////////////////////
75
76 template <typename TValue>
77 class MotifFinder<TValue, Pms1>
78 {
79 //_________________________________________________________________________________
80
81 public:
82 typedef unsigned int TSize;
83 typedef String<TValue> TString;
84 typedef String<TString> TStrings;
85
86 TSize motif_size;
87 TSize num_of_substitutions;
88 bool has_exact_substitutions;
89 TStrings set_of_motifs; // result set
90
91 //_________________________________________________________________________________
92
MotifFinder()93 MotifFinder()
94 {
95 SEQAN_CHECKPOINT;
96 }
MotifFinder(TSize const & l_,TSize const & d_,bool const & is_exact_)97 MotifFinder(TSize const & l_, TSize const & d_, bool const & is_exact_):
98 motif_size(l_),
99 num_of_substitutions(d_),
100 has_exact_substitutions(is_exact_)
101 {
102 SEQAN_CHECKPOINT;
103 }
MotifFinder(MotifFinder const & other_)104 MotifFinder(MotifFinder const & other_):
105 motif_size(other_.motif_size),
106 num_of_substitutions(other_.num_of_substitutions),
107 has_exact_substitutions(other_.has_exact_substitutions)
108 {
109 SEQAN_CHECKPOINT;
110 }
~MotifFinder()111 ~MotifFinder()
112 {
113 SEQAN_CHECKPOINT;
114 }
115
116 MotifFinder const &
117 operator = (MotifFinder const & other_)
118 {
119 SEQAN_CHECKPOINT;
120 if(this!=&other_)
121 {
122 motif_size = other_.motif_size;
123 num_of_substitutions = other_.num_of_substitutions;
124 has_exact_substitutions = other_.has_exact_substitutions;
125 }
126
127 return *this;
128 }
129 //_________________________________________________________________________________
130
131 }; // class MotifFinder<TValue, Pms1>
132
133 //////////////////////////////////////////////////////////////////////////////
134 // Functions
135 //////////////////////////////////////////////////////////////////////////////
136
137 template<typename TSeqType, typename TStrings, typename TModel>
138 inline void
findMotif(MotifFinder<TSeqType,Pms1> & finder,TStrings & dataset,TModel seq_model)139 findMotif(MotifFinder<TSeqType, Pms1> & finder,
140 TStrings & dataset,
141 TModel seq_model)
142 {
143 SEQAN_CHECKPOINT;
144 pms1(finder.set_of_motifs,
145 dataset,
146 finder.motif_size,
147 finder.num_of_substitutions,
148 finder.has_exact_substitutions,
149 seq_model);
150 }
151
152 //////////////////////////////////////////////////////////////////////////////
153
154 /*
155 .Function.pms1:
156 ..summary:Represents the Pms1 algorithm.
157 ..cat:Motif Search
158 ..signature:pms1(result_set,dataset,l,d,is_exact,h,model_type)
159 ..param.result_set:The result_set object.
160 ..param.dataset:The dataset object representing the input sequences.
161 ...type:Class.String
162 ...signature:String<TString>
163 ...param.TString:A @Class.String@ type
164 ....type:Class.String
165 ..param.l:The size of the motif.
166 ..param.d:The number of substitutions.
167 ..param.is_exact:The size of Hamming distance.
168 ...type:$bool$
169 ..param.model_type:The model_type object.
170 ...type:Tag.Oops
171 ...type:Tag.Omops
172 ...type:Tag.Zoops
173 ...type:Tag.Tcm
174 ..remarks:The Pms1 algorithm is able to run in Oops, Omops, Zoops and Tcm mode.
175 ..remarks:The resulted motif candidates found by the Pms1 algorithm will be stored in the result_set object.
176 ..include:seqan/find_motif.h
177 */
178
179 //////////////////////////////////////////////////////////////////////////////
180 // Oops model
181 //////////////////////////////////////////////////////////////////////////////
182
183 template<typename TStrings, typename TType>
184 void
pms1(TStrings & result_set,TStrings & dataset,TType const & l,TType const & d,bool const & is_exact,Oops const &)185 pms1(TStrings & result_set,
186 TStrings & dataset,
187 TType const & l,
188 TType const & d,
189 bool const & is_exact,
190 Oops const & /*model_type*/)
191 {
192 SEQAN_CHECKPOINT;
193
194 typedef typename Value<TStrings>::Type TString;
195 typedef typename Value<TString>::Type TValue;
196 Shape<TValue> shape(l);
197 // ----------------------------------------------------------------------------
198 // STEP 1:
199 // processing the first sequence.
200 // ----------------------------------------------------------------------------
201 typename Iterator<TStrings>::Type ds_iter = begin(dataset);
202 TString l_mer;
203 std::vector<int> var_of_l_mer, var_all;
204
205 typename Size<TString>::Type seq_len = length(*ds_iter);
206 typename Iterator<TString>::Type seq_iter = begin(*ds_iter);
207 typename Iterator<TString>::Type seq_end = begin(*ds_iter)+(seq_len-l+1);
208 while(seq_iter!=seq_end)
209 {
210 createDVariants(var_of_l_mer, seq_iter, l, d, is_exact, shape);
211 std::copy(var_of_l_mer.begin(), var_of_l_mer.end(), std::back_inserter(var_all));
212 var_of_l_mer.clear();
213 ++seq_iter;
214 }
215
216 // sort var_all and filter d-variants that occur exactly once in the sequence
217 std::sort(var_all.begin(), var_all.end());
218 std::vector<int>::iterator vect_iter = var_all.begin();
219 std::vector<int>::iterator vect_end = var_all.end();
220 std::vector<int> var_unique;
221 while(vect_iter!=vect_end)
222 {
223 std::vector<int>::iterator upper_iter =
224 std::upper_bound(var_all.begin(),var_all.end(),*vect_iter);
225 int count =
226 std::count(vect_iter, upper_iter, *vect_iter);
227 if(count==1)
228 {
229 var_unique.push_back(*vect_iter);
230 }
231 vect_iter+=count;
232 }
233 var_all.clear();
234 // ----------------------------------------------------------------------------
235 // STEP 2:
236 // processing the remaining input sequences
237 // ----------------------------------------------------------------------------
238 std::vector<int> var_unique2, result;
239 ++ds_iter;
240 for(; !atEnd(ds_iter, dataset); goNext(ds_iter))
241 {
242 seq_len = length(*ds_iter);
243 seq_iter = begin(*ds_iter);
244 seq_end = begin(*ds_iter)+(seq_len-l+1);
245 while(seq_iter!=seq_end)
246 {
247 createDVariants(var_of_l_mer, seq_iter, l, d, is_exact, shape);
248 std::copy(var_of_l_mer.begin(), var_of_l_mer.end(), std::back_inserter(var_all));
249 var_of_l_mer.clear();
250 ++seq_iter;
251 }
252 // sort var_all and filter d-variants that occur exactly once in the sequence
253 std::sort(var_all.begin(), var_all.end());
254 vect_iter = var_all.begin();
255 vect_end = var_all.end();
256 while(vect_iter!=vect_end)
257 {
258 std::vector<int>::iterator upper_iter =
259 std::upper_bound(var_all.begin(),var_all.end(),*vect_iter);
260 int count =
261 std::count_if(vect_iter, upper_iter, std::bind2nd(std::equal_to<int>(), *vect_iter));
262 if(count==1)
263 {
264 var_unique2.push_back(*vect_iter);
265 }
266 vect_iter+=count;
267 }
268 var_all.clear();
269
270 //create an insert_iterator for int-vector result
271 std::insert_iterator< std::vector<int> > res_ins(result, result.begin());
272 std::set_intersection(var_unique.begin(), var_unique.end(),
273 var_unique2.begin(), var_unique2.end(), res_ins);
274 var_unique2.clear();
275 var_unique.clear();
276 var_unique.resize(result.size());
277 std::copy(result.begin(),result.end(),var_unique.begin());
278 result.clear();
279 }
280 // ----------------------------------------------------------------------------
281 // STEP 3:
282 // insert the relevant d-variants into result set
283 // ----------------------------------------------------------------------------
284 resize(result_set, var_unique.size());
285 vect_iter = var_unique.begin();
286 vect_end = var_unique.end();
287 unsigned int variant_nr =0;
288 while(vect_iter!=vect_end)
289 {
290 l_mer = inverseHash<TValue>(*vect_iter, ValueSize<TValue>::VALUE, l);
291 result_set[variant_nr] = l_mer;
292 ++variant_nr;
293 ++vect_iter;
294 }
295 }
296
297 //////////////////////////////////////////////////////////////////////////////
298 // Omops model
299 //////////////////////////////////////////////////////////////////////////////
300
301 template<typename TStrings, typename TType>
302 void
pms1(TStrings & result_set,TStrings & dataset,TType const & l,TType const & d,bool const & is_exact,Omops const &)303 pms1(TStrings & result_set,
304 TStrings & dataset,
305 TType const & l,
306 TType const & d,
307 bool const & is_exact,
308 Omops const & /*model_type*/)
309 {
310 SEQAN_CHECKPOINT;
311
312 typedef typename Value<TStrings>::Type TString;
313 typedef typename Value<TString>::Type TValue;
314 Shape<TValue> shape(l);
315 // ----------------------------------------------------------------------------
316 // STEP 1:
317 // processing the first sequence.
318 // ----------------------------------------------------------------------------
319 typename Iterator<TStrings>::Type ds_iter = begin(dataset);
320 TString l_mer;
321 std::vector<int> var_of_l_mer, var_all;
322
323 typename Size<TString>::Type seq_len = length(*ds_iter);
324 typename Iterator<TString>::Type seq_iter = begin(*ds_iter);
325 typename Iterator<TString>::Type seq_end = begin(*ds_iter)+(seq_len-l+1);
326 while(seq_iter!=seq_end)
327 {
328 createDVariants(var_of_l_mer, seq_iter, l, d, is_exact, shape);
329 std::copy(var_of_l_mer.begin(), var_of_l_mer.end(), std::back_inserter(var_all));
330 var_of_l_mer.clear();
331 ++seq_iter;
332 }
333 // sort var_all and filter d-variants that occur exactly once in the sequence
334 std::sort(var_all.begin(), var_all.end());
335 std::vector<int> var_unique;
336 std::unique_copy(var_all.begin(), var_all.end(), std::back_inserter(var_unique));
337 var_all.clear();
338 // ----------------------------------------------------------------------------
339 // STEP 2:
340 // processing remaining input sequences
341 // ----------------------------------------------------------------------------
342 std::vector<int> var_unique2, result;
343 ++ds_iter;
344 for(; !atEnd(ds_iter, dataset); goNext(ds_iter))
345 {
346 seq_len = length(*ds_iter);
347 seq_iter = begin(*ds_iter);
348 while(seq_iter!=begin(*ds_iter)+(seq_len-l+1))
349 {
350 createDVariants(var_of_l_mer, seq_iter, l, d, is_exact, shape);
351 std::copy(var_of_l_mer.begin(), var_of_l_mer.end(), std::back_inserter(var_all));
352 var_of_l_mer.clear();
353 ++seq_iter;
354 }
355 // sort var_all and eliminate duplicates
356 std::sort(var_all.begin(), var_all.end());
357 std::unique_copy(var_all.begin(), var_all.end(), std::back_inserter(var_unique2));
358 var_all.clear();
359
360 //create an insert_iterator for int-vector result
361 std::insert_iterator< std::vector<int> > res_ins(result, result.begin());
362 std::set_intersection(var_unique.begin(), var_unique.end(),
363 var_unique2.begin(), var_unique2.end(), res_ins);
364 var_unique2.clear();
365 var_unique.clear();
366 var_unique.resize(result.size());
367 std::copy(result.begin(),result.end(),var_unique.begin());
368 result.clear();
369 }
370 // ----------------------------------------------------------------------------
371 // STEP 3:
372 // insert relevant d-variants into result set
373 // ----------------------------------------------------------------------------
374 resize(result_set, var_unique.size());
375 typename std::vector<int>::iterator vect_iter = var_unique.begin();
376 typename std::vector<int>::iterator vect_end = var_unique.end();
377 unsigned int variant_nr =0;
378 while(vect_iter!=vect_end)
379 {
380 l_mer = inverseHash<TValue>(*vect_iter, ValueSize<TValue>::VALUE, l);
381 result_set[variant_nr] = l_mer;
382 ++variant_nr;
383 ++vect_iter;
384 }
385 }
386
387 //////////////////////////////////////////////////////////////////////////////
388 // Zoops model
389 //////////////////////////////////////////////////////////////////////////////
390
391 template<typename TStrings, typename TType>
392 void
pms1(TStrings & result_set,TStrings & dataset,TType const & l,TType const & d,bool const & is_exact,Zoops const & model_type)393 pms1(TStrings & result_set,
394 TStrings & dataset,
395 TType const & l,
396 TType const & d,
397 bool const & is_exact,
398 Zoops const & model_type)
399 {
400 SEQAN_CHECKPOINT;
401
402 typedef typename Value<TStrings>::Type TString;
403 typedef typename Value<TString>::Type TValue;
404 Shape<TValue> shape(l);
405
406 // ----------------------------------------------------------------------------
407 // STEP 1:
408 // building d-variants for all l-mers from the input sequences and storing their
409 // corresponding hash-values (values of variants which occur exactly once
410 // in an input sequence) in an int-string 'var_ar'.
411 // index_mat contains the variant's hash values for each sequence.
412 // ----------------------------------------------------------------------------
413 typename Size<TStrings>::Type t = length(dataset);
414 typename Iterator<TStrings>::Type ds_iter = begin(dataset);
415 TString l_mer;
416 std::vector<int> var_of_l_mer, var_all, result_vect;
417 for(; !atEnd(ds_iter, dataset); goNext(ds_iter))
418 {
419 typename Size<TString>::Type seq_len = length(*ds_iter);
420 typename Iterator<TString>::Type seq_iter = begin(*ds_iter);
421 typename Iterator<TString>::Type seq_end = begin(*ds_iter)+(seq_len-l+1);
422 while(seq_iter!=seq_end)
423 {
424 createDVariants(var_of_l_mer, seq_iter, l, d, is_exact, shape);
425 std::copy(var_of_l_mer.begin(), var_of_l_mer.end(), std::back_inserter(var_all));
426 var_of_l_mer.clear();
427 ++seq_iter;
428 }
429 // sort var_all and filter d-variants that occur exactly once in the sequence
430 std::sort(var_all.begin(), var_all.end());
431 std::vector<int>::iterator vect_iter = var_all.begin();
432 std::vector<int>::iterator vect_end = var_all.end();
433 std::vector<int> var_unique;
434 while(vect_iter!=vect_end)
435 {
436 std::vector<int>::iterator upper_iter =
437 std::upper_bound(var_all.begin(),var_all.end(),*vect_iter);
438 int count =
439 std::count(vect_iter, upper_iter, *vect_iter);
440 if(count==1)
441 {
442 var_unique.push_back(*vect_iter);
443 }
444 vect_iter+=count;
445 }
446 var_all.clear();
447
448 //create an insert_iterator for int-vector var_all
449 std::insert_iterator< std::vector<int> > res_ins(var_all, var_all.begin());
450 std::merge(result_vect.begin(), result_vect.end(),
451 var_unique.begin(), var_unique.end(), res_ins);
452 var_unique.clear();
453 result_vect.clear();
454 result_vect.resize(var_all.size());
455 std::copy(var_all.begin(),var_all.end(),result_vect.begin());
456 var_all.clear();
457 }
458 // ----------------------------------------------------------------------------
459 // STEP 2:
460 // count votes of l-mers (d-variants) and insert relevant d-variants into result set
461 // ----------------------------------------------------------------------------
462 int lower_limit = (int) floor(t*(model_type.threshold)+0.5);
463 // count votes of relevant l-mers (d-variants)
464 std::vector<int>::iterator iter = result_vect.begin();
465 std::vector<int>::iterator end_iter = result_vect.end();
466 unsigned int num_of_results = 0;
467 while(iter!=end_iter)
468 {
469 std::vector<int>::iterator upper_iter =
470 std::upper_bound(result_vect.begin(),result_vect.end(),*iter);
471 int count =
472 std::count(iter, upper_iter, *iter);
473 if( count>=lower_limit )
474 {
475 var_all.push_back(*iter);
476 ++num_of_results;
477 }
478 iter+=count;
479 }
480 result_vect.clear();
481
482 resize(result_set, num_of_results);
483 num_of_results = 0;
484 iter = var_all.begin();
485 end_iter = var_all.end();
486 while(iter!=end_iter)
487 {
488 l_mer = inverseHash<TValue>(*iter, ValueSize<TValue>::VALUE, l);
489 result_set[num_of_results] = l_mer;
490 ++num_of_results;
491 ++iter;
492 }
493 var_all.clear();
494 }
495
496 //////////////////////////////////////////////////////////////////////////////
497 // Tcm model
498 //////////////////////////////////////////////////////////////////////////////
499
500 template<typename TStrings, typename TType>
501 void
pms1(TStrings & result_set,TStrings & dataset,TType const & l,TType const & d,bool const & is_exact,Tcm const & model_type)502 pms1(TStrings & result_set,
503 TStrings & dataset,
504 TType const & l,
505 TType const & d,
506 bool const & is_exact,
507 Tcm const & model_type)
508 {
509 SEQAN_CHECKPOINT;
510
511 typedef typename Value<TStrings>::Type TString;
512 typedef typename Value<TString>::Type TValue;
513 Shape<TValue> shape(l);
514 // ----------------------------------------------------------------------------
515 // STEP 1:
516 // building d-variants for all l-mers from the input sequences and storing their
517 // corresponding hash-values (values of variants which occur exactly once
518 // in an input sequence) in an int-string 'var_ar'.
519 // index_mat contains the variant's hash values for each sequence.
520 // ----------------------------------------------------------------------------
521 typename Size<TStrings>::Type t = length(dataset);
522 typename Iterator<TStrings>::Type ds_iter = begin(dataset);
523 TString l_mer;
524 std::vector<int> var_of_l_mer, var_all, result_vect, unique_vect;
525 for(; !atEnd(ds_iter, dataset); goNext(ds_iter))
526 {
527 typename Size<TString>::Type seq_len = length(*ds_iter);
528 typename Iterator<TString>::Type seq_iter = begin(*ds_iter);
529 while(seq_iter!=begin(*ds_iter)+(seq_len-l+1))
530 {
531 createDVariants(var_of_l_mer, seq_iter, l, d, is_exact, shape);
532 std::copy(var_of_l_mer.begin(), var_of_l_mer.end(), std::back_inserter(var_all));
533 var_of_l_mer.clear();
534 ++seq_iter;
535 }
536
537 // sort var_all and filter d-variants that occur exactly once in the sequence
538 std::sort(var_all.begin(), var_all.end());
539 std::insert_iterator< std::vector<int> > res_ins1(unique_vect, unique_vect.begin());
540 std::unique_copy(var_all.begin(), var_all.end(), res_ins1);
541 var_all.clear();
542
543 //create an insert_iterator for int-vector var_all
544 std::insert_iterator< std::vector<int> > res_ins2(var_all, var_all.begin());
545 std::merge(result_vect.begin(), result_vect.end(),
546 unique_vect.begin(), unique_vect.end(), res_ins2);
547 unique_vect.clear();
548 result_vect.clear();
549 result_vect.resize(var_all.size());
550 std::copy(var_all.begin(),var_all.end(),result_vect.begin());
551 var_all.clear();
552 }
553 // ----------------------------------------------------------------------------
554 // STEP 2:
555 // count votes of l-mers (d-variants) and insert relevant d-variants into result set
556 // ----------------------------------------------------------------------------
557 int lower_limit = (int) floor(t*(model_type.threshold)+0.5);
558
559 // count votes of relevant l-mers (d-variants)
560 std::vector<int>::iterator iter = result_vect.begin();
561 std::vector<int>::iterator end_iter = result_vect.end();
562 unsigned int num_of_results = 0;
563 while(iter!=end_iter)
564 {
565 std::vector<int>::iterator upper_iter =
566 std::upper_bound(result_vect.begin(),result_vect.end(),*iter);
567 int count =
568 std::count(iter, upper_iter, *iter);
569 if( count>=lower_limit )
570 {
571 var_all.push_back(*iter);
572 ++num_of_results;
573 }
574 iter+=count;
575 }
576 result_vect.clear();
577
578 resize(result_set, num_of_results);
579 num_of_results = 0;
580 iter = var_all.begin();
581 end_iter = var_all.end();
582 while(iter!=end_iter)
583 {
584 l_mer = inverseHash<TValue>(*iter, ValueSize<TValue>::VALUE, l);
585 result_set[num_of_results] = l_mer;
586 ++num_of_results;
587 ++iter;
588 }
589 var_all.clear();
590 }
591
592 //////////////////////////////////////////////////////////////////////////////
593 //Subfunctions
594 //////////////////////////////////////////////////////////////////////////////
595
596 /*
597 .Function.createDVariants:
598 ..summary:Creates the d-variants of a given l-mer and computes their hash-values which will be
599 finally stored in array 'variants'.
600 ..cat:Motif Search
601 ..signature:createDVariants(variants,l_mer_begin,l,d,is_exact,shape)
602 ..param.variants:The set of d-variants of a given l-mer.
603 ...type:String<int>
604 ...remarks:The string holds the hash-values of each pattern being in the d-vicinity of a given l-mer.
605 ..param.l_mer_begin:An iterator pointing to the beginning of a given l-mer pattern.
606 ...type:Concept.Iterator Iterator
607 ....remarks:Standard conform iterator
608 ...type:Shortcut.DnaIterator
609 ....remarks:Iterator for @Shortcut.DnaString@ (a string of @Spec.Dna@).
610 ....see:Shortcut.DnaIterator
611 ...type:Shortcut.PeptideIterator
612 ....remarks:Iterator for @Shortcut.Peptide@ (a string of @Spec.AminoAcid@).
613 ....see:Shortcut.PeptideIterator
614 ..param.l:The size of the motif.
615 ..param.d:The number of substitutions.
616 ..param.is_exact:The size of Hamming distance
617 ...type:$bool$
618 ..param.shape:The @Class.Shape@ object.
619 ...type:Class.Shape
620 ...signature:Shape<TValue, SimpleShape>
621 ...remarks:Is used for computing the hash-value of each variant.
622 ..remarks: #d-variants =
623 - sum(i,0,d,binomialCoefficient(l,i)*(alp_size-1)^i), if is_exact=false
624 - binomialCoefficient(l.d)*(alp_size-1)^d), else
625 ..include:seqan/find_motif.h
626 */
627
628 template<typename TIntVect, typename TIterString, typename TType, typename TShape>
629 void
createDVariants(TIntVect & variants,TIterString l_mer_begin,TType const & l,TType const & d,bool is_exact,TShape & shape)630 createDVariants(TIntVect & variants,
631 TIterString l_mer_begin,
632 TType const & l,
633 TType const & d,
634 bool is_exact,
635 TShape & shape)
636 {
637 SEQAN_CHECKPOINT;
638
639 typedef String<int> TIntArray;
640 String<TIntArray> bitsets;
641 if(is_exact)
642 {
643 // get all variants of bitsets consisting of d zeros(pos of mismatches)
644 // and (l-d) ones
645 _getVariantsOfBitset(bitsets, l, d);
646
647 typename Iterator< String<TIntArray> >::Type bitsets_iter = begin(bitsets);
648 typename Iterator< String<TIntArray> >::Type bitsets_end = end(bitsets);
649 while(bitsets_iter!=bitsets_end)
650 {
651 TIntVect hash_val_of_variants;
652 _buildVariants(hash_val_of_variants, l_mer_begin, d, *bitsets_iter, shape);
653 std::copy(hash_val_of_variants.begin(), hash_val_of_variants.end(), std::back_inserter(variants));
654 ++bitsets_iter;
655 }
656 clear(bitsets);
657 }
658 else
659 {
660 for(unsigned int i=0; i<=d; ++i)
661 {
662 _getVariantsOfBitset(bitsets, l, i);
663 typename Iterator< String<TIntArray> >::Type bitsets_iter = begin(bitsets);
664 typename Iterator< String<TIntArray> >::Type bitsets_end = end(bitsets);
665 while(bitsets_iter!=bitsets_end)
666 {
667 TIntVect hash_val_of_variants;
668 _buildVariants(hash_val_of_variants, l_mer_begin, i, *bitsets_iter, shape);
669 std::copy(hash_val_of_variants.begin(), hash_val_of_variants.end(), std::back_inserter(variants));
670 ++bitsets_iter;
671 }
672 clear(bitsets);
673 }
674 }
675 }
676
677 //////////////////////////////////////////////////////////////////////////////
678
679 /*
680 .Function._getVariantsOfBitset:
681 ..summary:Builds all variants of bitsets consisting of d zeros & (l-d) ones which
682 will be stored in an seqan::String.
683 ..cat:Motif Search
684 ..signature:_getVariantsOfBitset(bitsets,l,d)
685 ..param.bitsets:The collection of bitsets.
686 ...type:Class.String
687 ...signature:String< String<int> >
688 ...remarks:Bitsets contains different length-l arrays consisting of d zero- (position of mismatches)
689 and (l-d) one-values
690 ..param.l:The size of the motif.
691 ..param.d:The number of substitutions.
692 ..remarks: #bitsets = binomialCoefficient(l,d).
693 ..include:seqan/find_motif.h
694 */
695
696 template<typename TStrings, typename TType>
697 void
_getVariantsOfBitset(TStrings & bitsets,TType const & l,TType const & d)698 _getVariantsOfBitset(TStrings & bitsets,
699 TType const & l,
700 TType const & d)
701 {
702 SEQAN_CHECKPOINT;
703
704 unsigned int num_of_bitsets = binomialCoefficient(l, d);
705 resize(bitsets, num_of_bitsets);
706
707 typename Value<TStrings>::Type bitset;
708 resize(bitset, l);
709 std::fill(begin(bitset), end(bitset), 1);
710 std::fill_n(begin(bitset), d, 0);
711
712 for(typename Position<TStrings>::Type i=0;i<num_of_bitsets; ++i)
713 {
714 bitsets[i] = bitset;
715 std::next_permutation(begin(bitset), end(bitset));
716 }
717 }
718
719 //////////////////////////////////////////////////////////////////////////////
720
721 /*
722 .Function._buildVariants:
723 ..summary:Builds all d-variants of a given l-mer given the position(s) of mismatch(es) and
724 then computes the hash-values for each variant which will be finally stored
725 in an int-vector variants.
726 ..cat:Motif Search
727 ..signature:_buildVariants(variants,l_mer_begin,d,bitset,shape)
728 ..param.variants:The int-vector of d-variants of a given l-mer
729 ...remarks:The vector holds the hash-values of each d-variant.
730 ...type:vector<int>
731 ..param.l_mer_begin:An iterator pointing to the beginning of a given l-mer pattern.
732 ...type:Concept.Iterator Iterator
733 ....remarks:Standard conform iterator
734 ...type:Shortcut.DnaIterator
735 ....remarks:Iterator for @Shortcut.DnaString@ (a string of @Spec.Dna@).
736 ....see:Shortcut.DnaIterator
737 ...type:Shortcut.PeptideIterator
738 ....remarks:Iterator for @Shortcut.Peptide@ (a string of @Spec.AminoAcid@).
739 ....see:Shortcut.PeptideIterator
740 ..param.d:The number of substitutions.
741 ..param.bitset:The bitset object.
742 ...type:Class.String
743 ...signature:String<int>
744 ...remarks:The length-l int-array consisting of d zeros (for mismatch positions)
745 & (l-d) ones)
746 ..param.shape:The @Class.Shape@ object.
747 ...type:Class.Shape
748 ...signature:Shape<TValue, SimpleShape>
749 ...remarks:Is used for computing the hash-value of each variant.
750 ..remarks: #variants = (alp_size-1)^d for a given bitset, d-value and l-mer.
751 ..include:seqan/find_motif.h
752 */
753
754 template<typename TIntVect, typename TStringIter, typename TType, typename TBitset, typename TValue, typename TSpec>
755 void
_buildVariants(TIntVect & variants,TStringIter l_mer_begin,TType const & d,TBitset const & bitset,Shape<TValue,TSpec> & shape)756 _buildVariants(TIntVect & variants,
757 TStringIter l_mer_begin,
758 TType const & d,
759 TBitset const & bitset,
760 Shape<TValue, TSpec> & shape)
761 {
762 SEQAN_CHECKPOINT;
763
764 typedef String<TValue> TString;
765 typedef typename Position<TString>::Type TPos;
766
767 //build l-mer
768 TString l_mer;
769 resize(l_mer, length(bitset));
770 for(TPos i=0; i<length(l_mer); ++i)
771 {
772 l_mer[i] = *l_mer_begin;
773 ++l_mer_begin;
774 }
775
776 // d-mers
777 unsigned int num_of_d_mers =
778 (unsigned int) pow((double)ValueSize<TValue>::VALUE, (int)d);
779 for(unsigned int i=0; i<num_of_d_mers; ++i)
780 {
781 TString d_mer = inverseHash<TValue>(i, ValueSize<TValue>::VALUE, d);
782 TString variant = l_mer;
783
784 typename Position<TString>::Type pos_of_d_mer = 0;
785 typename Position<TString>::Type pos_of_variant = 0;
786 typename Iterator<TString>::Type iter = begin(variant);
787 typename Iterator<TString>::Type iter_end = end(variant);
788 while(iter!=iter_end)
789 {
790 if( bitset[pos_of_variant]==0)
791 {
792 if(l_mer[pos_of_variant]!=d_mer[pos_of_d_mer])
793 {
794 variant[pos_of_variant] = d_mer[pos_of_d_mer];
795 ++pos_of_d_mer;
796 ++pos_of_variant;
797 ++iter;
798 }
799 else
800 {
801 iter = end(variant);
802 }
803 }
804 else
805 {
806 ++iter;
807 ++pos_of_variant;
808 }
809 }
810 if(pos_of_d_mer==d)
811 {
812 variants.push_back(hash(shape, begin(variant)));
813 }
814 }
815 }
816
817 //////////////////////////////////////////////////////////////////////////////
818
819 }// namespace SEQAN_NAMESPACE_MAIN
820
821 #endif //#ifndef SEQAN_HEADER_...
822
823