1 // ==========================================================================
2 //                 SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2010, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 //     * Redistributions of source code must retain the above copyright
11 //       notice, this list of conditions and the following disclaimer.
12 //     * Redistributions in binary form must reproduce the above copyright
13 //       notice, this list of conditions and the following disclaimer in the
14 //       documentation and/or other materials provided with the distribution.
15 //     * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 //       its contributors may be used to endorse or promote products derived
17 //       from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 
33 #ifndef SEQAN_HEADER_FIND_MOTIF_PMS1_H
34 #define SEQAN_HEADER_FIND_MOTIF_PMS1_H
35 
36 namespace SEQAN_NAMESPACE_MAIN
37 {
38 
39 //////////////////////////////////////////////////////////////////////////////
40 // Pms1
41 //////////////////////////////////////////////////////////////////////////////
42 
43 /**
44 .Spec.Pms1:
45 ..summary: Represents the Pms1 algorithm developed by Rajasekaran et al.
46 ..general:Class.MotifFinder
47 ..cat:Motif Search
48 ..signature:MotifFinder<TValue, Pms1>
49 ..param.TValue:The type of sequences to be analyzed.
50 ...type:Spec.Dna
51 ...type:Spec.AminoAcid
52 ..remarks:The exact @Spec.Pms1|Pms1 algorithm@ (Planted Motif Search 1) searches in the space
53           of possible motifs such as @Spec.EPatternBranching@. The procedure of the algorithm
54 		  is quite simple. For every l-mer in each input sequence the algorithm generates
55 		  all possible length-l patterns in the Hamming distance $d$-neighborhood of $x$.
56 		  The neighbor sets for each sequence are then intersected so that at the end of the process
57 		  we get a group of l-mers or a single l-mer that occur(s) in each input sequence with $d$
58 		  substitutions.
59 ..include:seqan/find_motif.h
60 */
61 
62 ///.Class.MotifFinder.param.TSpec.type:Spec.Pms1
63 
64 struct Pms1_;
65 typedef Tag<Pms1_> const Pms1;
66 
67 //////////////////////////////////////////////////////////////////////////////
68 // MotifFinder - Pms1 Spec
69 //
70 // t:=dataset size (number of sequences)
71 // n:=average sequence size
72 // l:=motif size
73 // d:=number of substitutions
74 //////////////////////////////////////////////////////////////////////////////
75 
76 template <typename TValue>
77 class MotifFinder<TValue, Pms1>
78 {
79 //_________________________________________________________________________________
80 
81 public:
82 	typedef unsigned int TSize;
83 	typedef String<TValue> TString;
84 	typedef String<TString> TStrings;
85 
86 	TSize motif_size;
87 	TSize num_of_substitutions;
88 	bool has_exact_substitutions;
89 	TStrings set_of_motifs; // result set
90 
91 //_________________________________________________________________________________
92 
MotifFinder()93 	MotifFinder()
94 	{
95     SEQAN_CHECKPOINT;
96 	}
MotifFinder(TSize const & l_,TSize const & d_,bool const & is_exact_)97 	MotifFinder(TSize const & l_, TSize const & d_, bool const & is_exact_):
98 		motif_size(l_),
99 		num_of_substitutions(d_),
100 		has_exact_substitutions(is_exact_)
101 	{
102     SEQAN_CHECKPOINT;
103 	}
MotifFinder(MotifFinder const & other_)104 	MotifFinder(MotifFinder const & other_):
105 		motif_size(other_.motif_size),
106 		num_of_substitutions(other_.num_of_substitutions),
107 		has_exact_substitutions(other_.has_exact_substitutions)
108 	{
109     SEQAN_CHECKPOINT;
110 	}
~MotifFinder()111 	~MotifFinder()
112 	{
113     SEQAN_CHECKPOINT;
114 	}
115 
116 	MotifFinder const &
117 	operator = (MotifFinder const & other_)
118 	{
119     SEQAN_CHECKPOINT;
120 		if(this!=&other_)
121 		{
122 			motif_size = other_.motif_size;
123 			num_of_substitutions = other_.num_of_substitutions;
124 			has_exact_substitutions = other_.has_exact_substitutions;
125 		}
126 
127 		return *this;
128 	}
129 //_________________________________________________________________________________
130 
131 }; // class MotifFinder<TValue, Pms1>
132 
133 //////////////////////////////////////////////////////////////////////////////
134 // Functions
135 //////////////////////////////////////////////////////////////////////////////
136 
137 template<typename TSeqType, typename TStrings, typename TModel>
138 inline void
findMotif(MotifFinder<TSeqType,Pms1> & finder,TStrings & dataset,TModel seq_model)139 findMotif(MotifFinder<TSeqType, Pms1> & finder,
140 		  TStrings & dataset,
141 		  TModel seq_model)
142 {
143     SEQAN_CHECKPOINT;
144 	pms1(finder.set_of_motifs,
145 		 dataset,
146 		 finder.motif_size,
147 		 finder.num_of_substitutions,
148 		 finder.has_exact_substitutions,
149 		 seq_model);
150 }
151 
152 //////////////////////////////////////////////////////////////////////////////
153 
154 /*
155 .Function.pms1:
156 ..summary:Represents the Pms1 algorithm.
157 ..cat:Motif Search
158 ..signature:pms1(result_set,dataset,l,d,is_exact,h,model_type)
159 ..param.result_set:The result_set object.
160 ..param.dataset:The dataset object representing the input sequences.
161 ...type:Class.String
162 ...signature:String<TString>
163 ...param.TString:A @Class.String@ type
164 ....type:Class.String
165 ..param.l:The size of the motif.
166 ..param.d:The number of substitutions.
167 ..param.is_exact:The size of Hamming distance.
168 ...type:$bool$
169 ..param.model_type:The model_type object.
170 ...type:Tag.Oops
171 ...type:Tag.Omops
172 ...type:Tag.Zoops
173 ...type:Tag.Tcm
174 ..remarks:The Pms1 algorithm is able to run in Oops, Omops, Zoops and Tcm mode.
175 ..remarks:The resulted motif candidates found by the Pms1 algorithm will be stored in the result_set object.
176 ..include:seqan/find_motif.h
177 */
178 
179 //////////////////////////////////////////////////////////////////////////////
180 //	Oops model
181 //////////////////////////////////////////////////////////////////////////////
182 
183 template<typename TStrings, typename TType>
184 void
pms1(TStrings & result_set,TStrings & dataset,TType const & l,TType const & d,bool const & is_exact,Oops const &)185 pms1(TStrings & result_set,
186 	 TStrings & dataset,
187 	 TType const & l,
188 	 TType const & d,
189 	 bool const & is_exact,
190 	 Oops const & /*model_type*/)
191 {
192     SEQAN_CHECKPOINT;
193 
194 	typedef typename Value<TStrings>::Type TString;
195 	typedef typename Value<TString>::Type TValue;
196 	Shape<TValue> shape(l);
197 	// ----------------------------------------------------------------------------
198 	// STEP 1:
199 	// processing the first sequence.
200 	// ----------------------------------------------------------------------------
201 	typename Iterator<TStrings>::Type ds_iter = begin(dataset);
202 	TString l_mer;
203 	std::vector<int> var_of_l_mer, var_all;
204 
205 	typename Size<TString>::Type seq_len = length(*ds_iter);
206 	typename Iterator<TString>::Type seq_iter = begin(*ds_iter);
207 	typename Iterator<TString>::Type seq_end = begin(*ds_iter)+(seq_len-l+1);
208 	while(seq_iter!=seq_end)
209 	{
210 		createDVariants(var_of_l_mer, seq_iter, l, d, is_exact, shape);
211 		std::copy(var_of_l_mer.begin(), var_of_l_mer.end(), std::back_inserter(var_all));
212 		var_of_l_mer.clear();
213 		++seq_iter;
214 	}
215 
216 	// sort var_all and filter d-variants that occur exactly once in the sequence
217 	std::sort(var_all.begin(), var_all.end());
218 	std::vector<int>::iterator vect_iter = var_all.begin();
219 	std::vector<int>::iterator vect_end = var_all.end();
220 	std::vector<int> var_unique;
221 	while(vect_iter!=vect_end)
222 	{
223 		std::vector<int>::iterator upper_iter =
224 			std::upper_bound(var_all.begin(),var_all.end(),*vect_iter);
225 		int count =
226 			std::count(vect_iter, upper_iter, *vect_iter);
227 		if(count==1)
228 		{
229 			var_unique.push_back(*vect_iter);
230 		}
231 		vect_iter+=count;
232 	}
233 	var_all.clear();
234 	// ----------------------------------------------------------------------------
235 	// STEP 2:
236 	// processing the remaining input sequences
237 	// ----------------------------------------------------------------------------
238 	std::vector<int> var_unique2, result;
239 	++ds_iter;
240 	for(; !atEnd(ds_iter, dataset); goNext(ds_iter))
241 	{
242 		seq_len = length(*ds_iter);
243 		seq_iter = begin(*ds_iter);
244 		seq_end = begin(*ds_iter)+(seq_len-l+1);
245 		while(seq_iter!=seq_end)
246 		{
247 			createDVariants(var_of_l_mer, seq_iter, l, d, is_exact, shape);
248 			std::copy(var_of_l_mer.begin(), var_of_l_mer.end(), std::back_inserter(var_all));
249 			var_of_l_mer.clear();
250 			++seq_iter;
251 		}
252 		// sort var_all and filter d-variants that occur exactly once in the sequence
253 		std::sort(var_all.begin(), var_all.end());
254 		vect_iter = var_all.begin();
255 		vect_end = var_all.end();
256 		while(vect_iter!=vect_end)
257 		{
258 			std::vector<int>::iterator upper_iter =
259 				std::upper_bound(var_all.begin(),var_all.end(),*vect_iter);
260 			int count =
261 				std::count_if(vect_iter, upper_iter, std::bind2nd(std::equal_to<int>(), *vect_iter));
262 			if(count==1)
263 			{
264 				var_unique2.push_back(*vect_iter);
265 			}
266 			vect_iter+=count;
267 		}
268 		var_all.clear();
269 
270 		//create an insert_iterator for int-vector result
271 		std::insert_iterator< std::vector<int> > res_ins(result, result.begin());
272 		std::set_intersection(var_unique.begin(), var_unique.end(),
273 							  var_unique2.begin(), var_unique2.end(), res_ins);
274 		var_unique2.clear();
275 		var_unique.clear();
276 		var_unique.resize(result.size());
277 		std::copy(result.begin(),result.end(),var_unique.begin());
278 		result.clear();
279 	}
280 	// ----------------------------------------------------------------------------
281 	// STEP 3:
282 	// insert the relevant d-variants into result set
283 	// ----------------------------------------------------------------------------
284 	resize(result_set, var_unique.size());
285 	vect_iter = var_unique.begin();
286 	vect_end = var_unique.end();
287 	unsigned int variant_nr =0;
288 	while(vect_iter!=vect_end)
289 	{
290 		l_mer = inverseHash<TValue>(*vect_iter, ValueSize<TValue>::VALUE, l);
291 		result_set[variant_nr] = l_mer;
292 		++variant_nr;
293 		++vect_iter;
294 	}
295 }
296 
297 //////////////////////////////////////////////////////////////////////////////
298 //	Omops model
299 //////////////////////////////////////////////////////////////////////////////
300 
301 template<typename TStrings, typename TType>
302 void
pms1(TStrings & result_set,TStrings & dataset,TType const & l,TType const & d,bool const & is_exact,Omops const &)303 pms1(TStrings & result_set,
304 	 TStrings & dataset,
305 	 TType const & l,
306 	 TType const & d,
307 	 bool const & is_exact,
308 	 Omops const & /*model_type*/)
309 {
310     SEQAN_CHECKPOINT;
311 
312 	typedef typename Value<TStrings>::Type TString;
313 	typedef typename Value<TString>::Type TValue;
314 	Shape<TValue> shape(l);
315 	// ----------------------------------------------------------------------------
316 	// STEP 1:
317 	// processing the first sequence.
318 	// ----------------------------------------------------------------------------
319 	typename Iterator<TStrings>::Type ds_iter = begin(dataset);
320 	TString l_mer;
321 	std::vector<int> var_of_l_mer, var_all;
322 
323 	typename Size<TString>::Type seq_len = length(*ds_iter);
324 	typename Iterator<TString>::Type seq_iter = begin(*ds_iter);
325 	typename Iterator<TString>::Type seq_end = begin(*ds_iter)+(seq_len-l+1);
326 	while(seq_iter!=seq_end)
327 	{
328 		createDVariants(var_of_l_mer, seq_iter, l, d, is_exact, shape);
329 		std::copy(var_of_l_mer.begin(), var_of_l_mer.end(), std::back_inserter(var_all));
330 		var_of_l_mer.clear();
331 		++seq_iter;
332 	}
333 	// sort var_all and filter d-variants that occur exactly once in the sequence
334 	std::sort(var_all.begin(), var_all.end());
335 	std::vector<int> var_unique;
336 	std::unique_copy(var_all.begin(), var_all.end(), std::back_inserter(var_unique));
337 	var_all.clear();
338 	// ----------------------------------------------------------------------------
339 	// STEP 2:
340 	// processing remaining input sequences
341 	// ----------------------------------------------------------------------------
342 	std::vector<int> var_unique2, result;
343 	++ds_iter;
344 	for(; !atEnd(ds_iter, dataset); goNext(ds_iter))
345 	{
346 		seq_len = length(*ds_iter);
347 		seq_iter = begin(*ds_iter);
348 		while(seq_iter!=begin(*ds_iter)+(seq_len-l+1))
349 		{
350 			createDVariants(var_of_l_mer, seq_iter, l, d, is_exact, shape);
351 			std::copy(var_of_l_mer.begin(), var_of_l_mer.end(), std::back_inserter(var_all));
352 			var_of_l_mer.clear();
353 			++seq_iter;
354 		}
355 		// sort var_all and eliminate duplicates
356 		std::sort(var_all.begin(), var_all.end());
357 		std::unique_copy(var_all.begin(), var_all.end(), std::back_inserter(var_unique2));
358 		var_all.clear();
359 
360 		//create an insert_iterator for int-vector result
361 		std::insert_iterator< std::vector<int> > res_ins(result, result.begin());
362 		std::set_intersection(var_unique.begin(), var_unique.end(),
363 							  var_unique2.begin(), var_unique2.end(), res_ins);
364 		var_unique2.clear();
365 		var_unique.clear();
366 		var_unique.resize(result.size());
367 		std::copy(result.begin(),result.end(),var_unique.begin());
368 		result.clear();
369 	}
370 	// ----------------------------------------------------------------------------
371 	// STEP 3:
372 	// insert relevant d-variants into result set
373 	// ----------------------------------------------------------------------------
374 	resize(result_set, var_unique.size());
375 	typename std::vector<int>::iterator vect_iter = var_unique.begin();
376 	typename std::vector<int>::iterator vect_end = var_unique.end();
377 	unsigned int variant_nr =0;
378 	while(vect_iter!=vect_end)
379 	{
380 		l_mer = inverseHash<TValue>(*vect_iter, ValueSize<TValue>::VALUE, l);
381 		result_set[variant_nr] = l_mer;
382 		++variant_nr;
383 		++vect_iter;
384 	}
385 }
386 
387 //////////////////////////////////////////////////////////////////////////////
388 //	Zoops model
389 //////////////////////////////////////////////////////////////////////////////
390 
391 template<typename TStrings, typename TType>
392 void
pms1(TStrings & result_set,TStrings & dataset,TType const & l,TType const & d,bool const & is_exact,Zoops const & model_type)393 pms1(TStrings & result_set,
394 	 TStrings & dataset,
395 	 TType const & l,
396 	 TType const & d,
397 	 bool const & is_exact,
398 	 Zoops const & model_type)
399 {
400     SEQAN_CHECKPOINT;
401 
402 	typedef typename Value<TStrings>::Type TString;
403 	typedef typename Value<TString>::Type TValue;
404 	Shape<TValue> shape(l);
405 
406 	// ----------------------------------------------------------------------------
407 	// STEP 1:
408 	// building d-variants for all l-mers from the input sequences and storing their
409 	// corresponding hash-values (values of variants which occur exactly once
410 	// in an input sequence) in an int-string 'var_ar'.
411 	// index_mat contains the variant's hash values for each sequence.
412 	// ----------------------------------------------------------------------------
413 	typename Size<TStrings>::Type t = length(dataset);
414 	typename Iterator<TStrings>::Type ds_iter = begin(dataset);
415 	TString l_mer;
416 	std::vector<int> var_of_l_mer, var_all, result_vect;
417 	for(; !atEnd(ds_iter, dataset); goNext(ds_iter))
418 	{
419 		typename Size<TString>::Type seq_len = length(*ds_iter);
420 		typename Iterator<TString>::Type seq_iter = begin(*ds_iter);
421 		typename Iterator<TString>::Type seq_end = begin(*ds_iter)+(seq_len-l+1);
422 		while(seq_iter!=seq_end)
423 		{
424 			createDVariants(var_of_l_mer, seq_iter, l, d, is_exact, shape);
425 			std::copy(var_of_l_mer.begin(), var_of_l_mer.end(), std::back_inserter(var_all));
426 			var_of_l_mer.clear();
427 			++seq_iter;
428 		}
429 		// sort var_all and filter d-variants that occur exactly once in the sequence
430 		std::sort(var_all.begin(), var_all.end());
431 		std::vector<int>::iterator vect_iter = var_all.begin();
432 		std::vector<int>::iterator vect_end = var_all.end();
433 		std::vector<int> var_unique;
434 		while(vect_iter!=vect_end)
435 		{
436 			std::vector<int>::iterator upper_iter =
437 				std::upper_bound(var_all.begin(),var_all.end(),*vect_iter);
438 			int count =
439 				std::count(vect_iter, upper_iter, *vect_iter);
440 			if(count==1)
441 			{
442 				var_unique.push_back(*vect_iter);
443 			}
444 			vect_iter+=count;
445 		}
446 		var_all.clear();
447 
448 		//create an insert_iterator for int-vector var_all
449 		std::insert_iterator< std::vector<int> > res_ins(var_all, var_all.begin());
450 		std::merge(result_vect.begin(), result_vect.end(),
451 			       var_unique.begin(), var_unique.end(), res_ins);
452 		var_unique.clear();
453 		result_vect.clear();
454 		result_vect.resize(var_all.size());
455 		std::copy(var_all.begin(),var_all.end(),result_vect.begin());
456 		var_all.clear();
457 	}
458 	// ----------------------------------------------------------------------------
459 	// STEP 2:
460 	// count votes of l-mers (d-variants) and insert relevant d-variants into result set
461 	// ----------------------------------------------------------------------------
462 	int lower_limit = (int) floor(t*(model_type.threshold)+0.5);
463 	// count votes of relevant l-mers (d-variants)
464 	std::vector<int>::iterator iter = result_vect.begin();
465 	std::vector<int>::iterator end_iter = result_vect.end();
466 	unsigned int num_of_results = 0;
467 	while(iter!=end_iter)
468 	{
469 		std::vector<int>::iterator upper_iter =
470 			std::upper_bound(result_vect.begin(),result_vect.end(),*iter);
471 		int count =
472 			std::count(iter, upper_iter, *iter);
473 		if( count>=lower_limit )
474 		{
475 			var_all.push_back(*iter);
476 			++num_of_results;
477 		}
478 		iter+=count;
479 	}
480 	result_vect.clear();
481 
482 	resize(result_set, num_of_results);
483 	num_of_results = 0;
484 	iter = var_all.begin();
485 	end_iter = var_all.end();
486 	while(iter!=end_iter)
487 	{
488 		l_mer = inverseHash<TValue>(*iter, ValueSize<TValue>::VALUE, l);
489 		result_set[num_of_results] = l_mer;
490 		++num_of_results;
491 		++iter;
492 	}
493 	var_all.clear();
494 }
495 
496 //////////////////////////////////////////////////////////////////////////////
497 //	Tcm model
498 //////////////////////////////////////////////////////////////////////////////
499 
500 template<typename TStrings, typename TType>
501 void
pms1(TStrings & result_set,TStrings & dataset,TType const & l,TType const & d,bool const & is_exact,Tcm const & model_type)502 pms1(TStrings & result_set,
503 	 TStrings & dataset,
504 	 TType const & l,
505 	 TType const & d,
506 	 bool const & is_exact,
507 	 Tcm const & model_type)
508 {
509     SEQAN_CHECKPOINT;
510 
511 	typedef typename Value<TStrings>::Type TString;
512 	typedef typename Value<TString>::Type TValue;
513 	Shape<TValue> shape(l);
514 	// ----------------------------------------------------------------------------
515 	// STEP 1:
516 	// building d-variants for all l-mers from the input sequences and storing their
517 	// corresponding hash-values (values of variants which occur exactly once
518 	// in an input sequence) in an int-string 'var_ar'.
519 	// index_mat contains the variant's hash values for each sequence.
520 	// ----------------------------------------------------------------------------
521 	typename Size<TStrings>::Type t = length(dataset);
522 	typename Iterator<TStrings>::Type ds_iter = begin(dataset);
523 	TString l_mer;
524 	std::vector<int> var_of_l_mer, var_all, result_vect, unique_vect;
525 	for(; !atEnd(ds_iter, dataset); goNext(ds_iter))
526 	{
527 		typename Size<TString>::Type seq_len = length(*ds_iter);
528 		typename Iterator<TString>::Type seq_iter = begin(*ds_iter);
529 		while(seq_iter!=begin(*ds_iter)+(seq_len-l+1))
530 		{
531 			createDVariants(var_of_l_mer, seq_iter, l, d, is_exact, shape);
532 			std::copy(var_of_l_mer.begin(), var_of_l_mer.end(), std::back_inserter(var_all));
533 			var_of_l_mer.clear();
534 			++seq_iter;
535 		}
536 
537 		// sort var_all and filter d-variants that occur exactly once in the sequence
538 		std::sort(var_all.begin(), var_all.end());
539 		std::insert_iterator< std::vector<int> > res_ins1(unique_vect, unique_vect.begin());
540 		std::unique_copy(var_all.begin(), var_all.end(), res_ins1);
541 		var_all.clear();
542 
543 		//create an insert_iterator for int-vector var_all
544 		std::insert_iterator< std::vector<int> > res_ins2(var_all, var_all.begin());
545 		std::merge(result_vect.begin(), result_vect.end(),
546 			       unique_vect.begin(), unique_vect.end(), res_ins2);
547 		unique_vect.clear();
548 		result_vect.clear();
549 		result_vect.resize(var_all.size());
550 		std::copy(var_all.begin(),var_all.end(),result_vect.begin());
551 		var_all.clear();
552 	}
553 	// ----------------------------------------------------------------------------
554 	// STEP 2:
555 	// count votes of l-mers (d-variants) and insert relevant d-variants into result set
556 	// ----------------------------------------------------------------------------
557 	int lower_limit = (int) floor(t*(model_type.threshold)+0.5);
558 
559 	// count votes of relevant l-mers (d-variants)
560 	std::vector<int>::iterator iter = result_vect.begin();
561 	std::vector<int>::iterator end_iter = result_vect.end();
562 	unsigned int num_of_results = 0;
563 	while(iter!=end_iter)
564 	{
565 		std::vector<int>::iterator upper_iter =
566 			std::upper_bound(result_vect.begin(),result_vect.end(),*iter);
567 		int count =
568 			std::count(iter, upper_iter, *iter);
569 		if( count>=lower_limit )
570 		{
571 			var_all.push_back(*iter);
572 			++num_of_results;
573 		}
574 		iter+=count;
575 	}
576 	result_vect.clear();
577 
578 	resize(result_set, num_of_results);
579 	num_of_results = 0;
580 	iter = var_all.begin();
581 	end_iter = var_all.end();
582 	while(iter!=end_iter)
583 	{
584 		l_mer = inverseHash<TValue>(*iter, ValueSize<TValue>::VALUE, l);
585 		result_set[num_of_results] = l_mer;
586 		++num_of_results;
587 		++iter;
588 	}
589 	var_all.clear();
590 }
591 
592 //////////////////////////////////////////////////////////////////////////////
593 //Subfunctions
594 //////////////////////////////////////////////////////////////////////////////
595 
596 /*
597 .Function.createDVariants:
598 ..summary:Creates the d-variants of a given l-mer and computes their hash-values which will be
599           finally stored in array 'variants'.
600 ..cat:Motif Search
601 ..signature:createDVariants(variants,l_mer_begin,l,d,is_exact,shape)
602 ..param.variants:The set of d-variants of a given l-mer.
603 ...type:String<int>
604 ...remarks:The string holds the hash-values of each pattern being in the d-vicinity of a given l-mer.
605 ..param.l_mer_begin:An iterator pointing to the beginning of a given l-mer pattern.
606 ...type:Concept.Iterator Iterator
607 ....remarks:Standard conform iterator
608 ...type:Shortcut.DnaIterator
609 ....remarks:Iterator for @Shortcut.DnaString@ (a string of @Spec.Dna@).
610 ....see:Shortcut.DnaIterator
611 ...type:Shortcut.PeptideIterator
612 ....remarks:Iterator for @Shortcut.Peptide@ (a string of @Spec.AminoAcid@).
613 ....see:Shortcut.PeptideIterator
614 ..param.l:The size of the motif.
615 ..param.d:The number of substitutions.
616 ..param.is_exact:The size of Hamming distance
617 ...type:$bool$
618 ..param.shape:The @Class.Shape@ object.
619 ...type:Class.Shape
620 ...signature:Shape<TValue, SimpleShape>
621 ...remarks:Is used for computing the hash-value of each variant.
622 ..remarks: #d-variants =
623                   - sum(i,0,d,binomialCoefficient(l,i)*(alp_size-1)^i), if is_exact=false
624                   - binomialCoefficient(l.d)*(alp_size-1)^d),           else
625 ..include:seqan/find_motif.h
626 */
627 
628 template<typename TIntVect, typename TIterString, typename TType, typename TShape>
629 void
createDVariants(TIntVect & variants,TIterString l_mer_begin,TType const & l,TType const & d,bool is_exact,TShape & shape)630 createDVariants(TIntVect & variants,
631 				TIterString l_mer_begin,
632 				TType const & l,
633 				TType const & d,
634 				bool is_exact,
635 				TShape & shape)
636 {
637     SEQAN_CHECKPOINT;
638 
639 	typedef String<int> TIntArray;
640 	String<TIntArray> bitsets;
641 	if(is_exact)
642 	{
643 		// get all variants of bitsets consisting of d zeros(pos of mismatches)
644 		// and (l-d) ones
645 		_getVariantsOfBitset(bitsets, l, d);
646 
647 		typename Iterator< String<TIntArray> >::Type bitsets_iter = begin(bitsets);
648 		typename Iterator< String<TIntArray> >::Type bitsets_end = end(bitsets);
649 		while(bitsets_iter!=bitsets_end)
650 		{
651 			TIntVect hash_val_of_variants;
652 			_buildVariants(hash_val_of_variants, l_mer_begin, d, *bitsets_iter, shape);
653 			std::copy(hash_val_of_variants.begin(), hash_val_of_variants.end(), std::back_inserter(variants));
654 			++bitsets_iter;
655 		}
656 		clear(bitsets);
657 	}
658 	else
659 	{
660 		for(unsigned int i=0; i<=d; ++i)
661 		{
662 			_getVariantsOfBitset(bitsets, l, i);
663 			typename Iterator< String<TIntArray> >::Type bitsets_iter = begin(bitsets);
664 			typename Iterator< String<TIntArray> >::Type bitsets_end = end(bitsets);
665 			while(bitsets_iter!=bitsets_end)
666 			{
667 				TIntVect hash_val_of_variants;
668 				_buildVariants(hash_val_of_variants, l_mer_begin, i, *bitsets_iter, shape);
669 				std::copy(hash_val_of_variants.begin(), hash_val_of_variants.end(), std::back_inserter(variants));
670 				++bitsets_iter;
671 			}
672 			clear(bitsets);
673 		}
674 	}
675 }
676 
677 //////////////////////////////////////////////////////////////////////////////
678 
679 /*
680 .Function._getVariantsOfBitset:
681 ..summary:Builds all variants of bitsets consisting of d zeros & (l-d) ones which
682           will be stored in an seqan::String.
683 ..cat:Motif Search
684 ..signature:_getVariantsOfBitset(bitsets,l,d)
685 ..param.bitsets:The collection of bitsets.
686 ...type:Class.String
687 ...signature:String< String<int> >
688 ...remarks:Bitsets contains different length-l arrays consisting of d zero- (position of mismatches)
689            and (l-d) one-values
690 ..param.l:The size of the motif.
691 ..param.d:The number of substitutions.
692 ..remarks: #bitsets = binomialCoefficient(l,d).
693 ..include:seqan/find_motif.h
694 */
695 
696 template<typename TStrings, typename TType>
697 void
_getVariantsOfBitset(TStrings & bitsets,TType const & l,TType const & d)698 _getVariantsOfBitset(TStrings & bitsets,
699 					 TType const & l,
700 					 TType const & d)
701 {
702     SEQAN_CHECKPOINT;
703 
704 	unsigned int num_of_bitsets = binomialCoefficient(l, d);
705 	resize(bitsets, num_of_bitsets);
706 
707 	typename Value<TStrings>::Type bitset;
708 	resize(bitset, l);
709 	std::fill(begin(bitset), end(bitset), 1);
710 	std::fill_n(begin(bitset), d, 0);
711 
712 	for(typename Position<TStrings>::Type i=0;i<num_of_bitsets; ++i)
713 	{
714 		bitsets[i] = bitset;
715 		std::next_permutation(begin(bitset), end(bitset));
716 	}
717 }
718 
719 //////////////////////////////////////////////////////////////////////////////
720 
721 /*
722 .Function._buildVariants:
723 ..summary:Builds all d-variants of a given l-mer given the position(s) of mismatch(es) and
724           then computes the hash-values for each variant which will be finally stored
725           in an int-vector variants.
726 ..cat:Motif Search
727 ..signature:_buildVariants(variants,l_mer_begin,d,bitset,shape)
728 ..param.variants:The int-vector of d-variants of a given l-mer
729 ...remarks:The vector holds the hash-values of each d-variant.
730 ...type:vector<int>
731 ..param.l_mer_begin:An iterator pointing to the beginning of a given l-mer pattern.
732 ...type:Concept.Iterator Iterator
733 ....remarks:Standard conform iterator
734 ...type:Shortcut.DnaIterator
735 ....remarks:Iterator for @Shortcut.DnaString@ (a string of @Spec.Dna@).
736 ....see:Shortcut.DnaIterator
737 ...type:Shortcut.PeptideIterator
738 ....remarks:Iterator for @Shortcut.Peptide@ (a string of @Spec.AminoAcid@).
739 ....see:Shortcut.PeptideIterator
740 ..param.d:The number of substitutions.
741 ..param.bitset:The bitset object.
742 ...type:Class.String
743 ...signature:String<int>
744 ...remarks:The length-l int-array consisting of d zeros (for mismatch positions)
745            & (l-d) ones)
746 ..param.shape:The @Class.Shape@ object.
747 ...type:Class.Shape
748 ...signature:Shape<TValue, SimpleShape>
749 ...remarks:Is used for computing the hash-value of each variant.
750 ..remarks: #variants = (alp_size-1)^d for a given bitset, d-value and l-mer.
751 ..include:seqan/find_motif.h
752 */
753 
754 template<typename TIntVect, typename TStringIter, typename TType, typename TBitset, typename TValue, typename TSpec>
755 void
_buildVariants(TIntVect & variants,TStringIter l_mer_begin,TType const & d,TBitset const & bitset,Shape<TValue,TSpec> & shape)756 _buildVariants(TIntVect & variants,
757 			   TStringIter l_mer_begin,
758 			   TType const & d,
759 			   TBitset const & bitset,
760 			   Shape<TValue, TSpec> & shape)
761 {
762     SEQAN_CHECKPOINT;
763 
764 	typedef String<TValue> TString;
765 	typedef typename Position<TString>::Type TPos;
766 
767 	//build l-mer
768 	TString l_mer;
769 	resize(l_mer, length(bitset));
770 	for(TPos i=0; i<length(l_mer); ++i)
771 	{
772 		l_mer[i] = *l_mer_begin;
773 		++l_mer_begin;
774 	}
775 
776 	// d-mers
777 	unsigned int num_of_d_mers =
778 		(unsigned int) pow((double)ValueSize<TValue>::VALUE, (int)d);
779 	for(unsigned int i=0; i<num_of_d_mers; ++i)
780 	{
781 		TString d_mer = inverseHash<TValue>(i, ValueSize<TValue>::VALUE, d);
782 		TString variant = l_mer;
783 
784 		typename Position<TString>::Type pos_of_d_mer = 0;
785 		typename Position<TString>::Type pos_of_variant = 0;
786 		typename Iterator<TString>::Type iter = begin(variant);
787 		typename Iterator<TString>::Type iter_end = end(variant);
788 		while(iter!=iter_end)
789 		{
790 			if( bitset[pos_of_variant]==0)
791 			{
792 				if(l_mer[pos_of_variant]!=d_mer[pos_of_d_mer])
793 				{
794 					variant[pos_of_variant] = d_mer[pos_of_d_mer];
795 					++pos_of_d_mer;
796 					++pos_of_variant;
797 					++iter;
798 				}
799 				else
800 				{
801 					iter = end(variant);
802 				}
803 			}
804 			else
805 			{
806 				++iter;
807 				++pos_of_variant;
808 			}
809 		}
810 		if(pos_of_d_mer==d)
811 		{
812 			variants.push_back(hash(shape, begin(variant)));
813 		}
814 	}
815 }
816 
817 //////////////////////////////////////////////////////////////////////////////
818 
819 }// namespace SEQAN_NAMESPACE_MAIN
820 
821 #endif //#ifndef SEQAN_HEADER_...
822 
823