1 // ==========================================================================
2 // SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2010, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 // * Redistributions of source code must retain the above copyright
11 // notice, this list of conditions and the following disclaimer.
12 // * Redistributions in binary form must reproduce the above copyright
13 // notice, this list of conditions and the following disclaimer in the
14 // documentation and/or other materials provided with the distribution.
15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 // its contributors may be used to endorse or promote products derived
17 // from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32
33 #ifndef SEQAN_HEADER_FIND_MOTIF_PROJECTION_H
34 #define SEQAN_HEADER_FIND_MOTIF_PROJECTION_H
35
36 namespace SEQAN_NAMESPACE_MAIN
37 {
38
39 //////////////////////////////////////////////////////////////////////////////
40 // Projection
41 //////////////////////////////////////////////////////////////////////////////
42
43 /**
44 .Spec.Projection:
45 ..summary: Represents the PROJECTION algorithm of Buhler and Tompa.
46 ..general:Class.MotifFinder
47 ..cat:Motif Search
48 ..signature:MotifFinder<TValue, Projection>
49 ..param.TValue:The type of sequences to be analyzed.
50 ...type:Spec.Dna
51 ...type:Spec.AminoAcid
52 ..remarks:The @Spec.Projection|Projection algorithm@ is a heuristic algorithm that does not guarantee
53 that the unknown motif will be found every time. We can increase the chance of success
54 by performing a large number of independent trials to generate multiple guesses.
55 In each trial, @Spec.Projection@ makes a preselection of sets of length-l patterns called l-mers
56 which are likely to be a collection of motif instances (filtering step) and
57 refines them by some local searching techniques, e.g. @Function.em|EM algorithm@, Gibbs Sampling etc (refinement step).
58 ..include:seqan/find_motif.h
59 */
60
61 struct Projection_;
62 typedef Tag<Projection_> const Projection;
63
64 //////////////////////////////////////////////////////////////////////////////
65 // MotifFinder - Projection Spec
66 //
67 // t:=dataset size (number of sequences)
68 // l:=motif size
69 // m:=number of possible l-mers
70 // d:=number of substitutions
71 // k:=projection size
72 // s:=bucket size
73 // tr:=number of independent trials
74 //////////////////////////////////////////////////////////////////////////////
75
76 template <typename TValue>
77 class MotifFinder<TValue, Projection>
78 {
79 enum { ALPHABET_SIZE = ValueSize<TValue>::VALUE };
80 //____________________________________________________________________________________________
81 /*
82 enum { ALPHABET_SIZE = ValueSize<TValue>::VALUE };
83 typedef String<TValue> TString;
84 typedef String<TString> TStrings;
85 typedef typename Size<TStrings>::Type TSize1;
86 typedef typename Size<TString>::Type TSize2;*/
87
88 //____________________________________________________________________________________________
89
90 public:
91 typedef unsigned int TSize;
92 typedef String<TValue> TString;
93 typedef String<TString> TStrings;
94
95
96 TSize dataset_size;
97 TSize motif_size;
98 TSize total_num_of_l_mers;
99 TSize num_of_substitutions;
100 bool has_exact_substitutions;
101 TSize projection_size;
102 TSize bucket_threshold;
103 TSize num_of_trials;
104 TStrings set_of_motifs;
105 int score;
106
107 //____________________________________________________________________________________________
108
MotifFinder()109 MotifFinder():
110 dataset_size(0),
111 motif_size(0),
112 total_num_of_l_mers(0),
113 num_of_substitutions(0),
114 has_exact_substitutions(false),
115 projection_size(0),
116 bucket_threshold(0),
117 num_of_trials(0)
118 {
119 SEQAN_CHECKPOINT;
120 }
MotifFinder(TSize const & t_,TSize const & l_,TSize const & m_total_,TSize const & d_,bool const & is_exact_,TSize const & k_,TSize const & s_,TSize const & tr_)121 MotifFinder(TSize const & t_,
122 TSize const & l_,
123 TSize const & m_total_,
124 TSize const & d_,
125 bool const & is_exact_,
126 TSize const & k_,
127 TSize const & s_,
128 TSize const & tr_):
129 dataset_size(t_),
130 motif_size(l_),
131 total_num_of_l_mers(m_total_),
132 num_of_substitutions(d_),
133 has_exact_substitutions(is_exact_),
134 projection_size(k_),
135 bucket_threshold(s_),
136 num_of_trials(tr_)
137 {
138 SEQAN_CHECKPOINT;
139 }
MotifFinder(TSize const & t_,TSize const & l_,TSize const & m_total_,TSize const & d_,bool const & is_exact_)140 MotifFinder(TSize const & t_,
141 TSize const & l_,
142 TSize const & m_total_,
143 TSize const & d_,
144 bool const & is_exact_):
145 dataset_size(t_),
146 motif_size(l_),
147 total_num_of_l_mers(m_total_),
148 num_of_substitutions(d_),
149 has_exact_substitutions(is_exact_),
150 projection_size(0),
151 bucket_threshold(0),
152 num_of_trials(0)
153 {
154 SEQAN_CHECKPOINT;
155
156 projection_size =
157 _computeProjectionSize<TSize>(ALPHABET_SIZE,
158 motif_size,
159 num_of_substitutions,
160 total_num_of_l_mers);
161
162 bucket_threshold =
163 _computeBucketThreshold<TSize>(ALPHABET_SIZE,
164 motif_size,
165 num_of_substitutions,
166 total_num_of_l_mers,
167 projection_size);
168
169 double prob_q = static_cast<double>(0.95);
170 num_of_trials = _computeNumOfTrials(dataset_size,
171 motif_size,
172 num_of_substitutions,
173 projection_size,
174 bucket_threshold,
175 prob_q);
176 }
MotifFinder(MotifFinder const & other_)177 MotifFinder(MotifFinder const & other_):
178 dataset_size(other_.dataset_size),
179 motif_size(other_.motif_size),
180 total_num_of_l_mers(other_.total_num_of_l_mers),
181 num_of_substitutions(other_.num_of_substitutions),
182 has_exact_substitutions(other_.has_exact_substitutions),
183 projection_size(other_.projection_size),
184 bucket_threshold(other_.bucket_threshold),
185 num_of_trials(other_.num_of_trials)
186 {
187 SEQAN_CHECKPOINT;
188 }
~MotifFinder()189 ~MotifFinder()
190 {
191 SEQAN_CHECKPOINT;
192 }
193
194 MotifFinder const &
195 operator = (MotifFinder const & other_)
196 {
197 SEQAN_CHECKPOINT;
198 if(this!=&other_)
199 {
200 this->dataset_size = other_.dataset_size;
201 this->motif_size = other_.motif_size;
202 this->total_num_of_l_mers = other_.total_num_of_l_mers;
203 this->num_of_substitutions = other_.number_of_substitutions;
204 this->has_exact_substitutions = other_.has_exact_substitutions;
205 this->projection_size = other_.projection_size;
206 this->bucket_threshold = other_.bucket_threshold;
207 this->num_of_trials = other_.num_of_trials;
208 }
209
210 return *this;
211 }
212
213 //____________________________________________________________________________________________
214
215 }; // class MotifFinder<TValue, Projection>
216
217
218 //////////////////////////////////////////////////////////////////////////////
219 // Functions
220 //////////////////////////////////////////////////////////////////////////////
221
222 /*
223 .Function._computeProjectionSize:
224 ..summary:Computes the projection size (k).
225 ..cat:Motif Search
226 ..signature:_computeProjectionSize(alp_size,l,d,m)
227 ..param.alp_size:The size of the sequence alphabet.
228 ...remarks:The alp_size object is four for nucleotide sequences and twenty for amino acid sequences.
229 ..param.l:The size of the motif.
230 ..param.d:The number of substitutions.
231 ..param.m:The total number of possible l-mers of a given dataset.
232 ..include:seqan/find_motif.h
233 */
234
235 template<typename TType>
236 TType
_computeProjectionSize(TType const & alp_size,TType const & l,TType const & d,TType const & m_total)237 _computeProjectionSize(TType const & alp_size,
238 TType const & l,
239 TType const & d,
240 TType const & m_total)
241 {
242 SEQAN_CHECKPOINT;
243
244 TType result = static_cast<TType>(0);
245 double numerator = log(static_cast<double>(m_total));
246 double denominator = log(static_cast<double>(alp_size));
247 double fraction = (numerator/denominator);
248
249 if( fraction<static_cast<double>(l-d-1) )
250 {
251 result = static_cast<TType>(floor(fraction)+1);
252 }
253 else
254 {
255 result = l-d-1;
256 }
257
258 return result;
259 }
260
261 //////////////////////////////////////////////////////////////////////////////
262
263 /*
264 .Function._computeBucketThreshold:
265 ..summary:Computes the bucket threshold size (s).
266 ..cat:Motif Search
267 ..signature:_computeBucketThreshold(alp_size,l,d,m,k)
268 ..param.alp_size:The size of the sequence alphabet.
269 ...remarks:The alp_size object is four for nucleotide sequences and twenty for amino acid sequences.
270 ..param.l:The size of the motif.
271 ..param.d:The number of substitutions.
272 ..param.m:The total number of possible l-mers of a given dataset.
273 ..param.k:The projection size.
274 ..include:seqan/find_motif.h
275 */
276
277 template<typename TType>
278 TType
_computeBucketThreshold(TType const & alp_size,TType const & l,TType const & d,TType const & m_total,TType const & k)279 _computeBucketThreshold(TType const & alp_size,
280 TType const & l,
281 TType const & d,
282 TType const & m_total,
283 TType const & k)
284 {
285 SEQAN_CHECKPOINT;
286
287 TType result = 3; // or 4
288 double numerator = log(static_cast<double>(m_total));
289 double denominator = log(static_cast<double>(alp_size));
290 double fraction = (numerator/denominator);
291
292 if( fraction>=static_cast<double>(l-d-1) )
293 {
294 result =
295 static_cast<TType>(floor(static_cast<double>(m_total/
296 pow(static_cast<double>(alp_size), static_cast<double>(k))*2)));
297 if(result<1)
298 {
299 result = 1;
300 }
301 }
302
303 return result;
304 }
305
306 //////////////////////////////////////////////////////////////////////////////
307
308 /*
309 .Function._computeNumOfTrials:
310 ..summary:Computes the number of independent trials (tr).
311 ..cat:Motif Search
312 ..signature:_computeNumOfTrials(t,l,d,k,s,prob_q)
313 ..param.t:The number of input sequences.
314 ..param.l:The size of the motif.
315 ..param.d:The number of substitutions.
316 ..param.k:The projection size.
317 ..param.s:The bucket threshold size.
318 ..param.prob_q:
319 ...remarks:The prob_q object represents the probability that the planted bucket contains "s" or
320 more planted motif instances in at least one of the "tr" trials. Normally, we use
321 prob_q=0.95.
322 ...type:$double$
323 ..remarks:tr>= log(1-q)/log(B), where p is the probability that each motif occurence hashes
324 to the planted bucket and B is the probability that fewer than s planted occurences hash
325 to the planted buckes in a given trial
326 ..include:seqan/find_motif.h
327 */
328
329 template<typename TType>
330 TType
_computeNumOfTrials(TType const & t,TType const & l,TType const & d,TType const & k,TType const & s,double const & prob_q)331 _computeNumOfTrials(TType const & t,
332 TType const & l,
333 TType const & d,
334 TType const & k,
335 TType const & s,
336 double const & prob_q)
337 {
338 SEQAN_CHECKPOINT;
339
340 double prob_p =
341 static_cast<double>(binomialCoefficient( (l-d), k ))
342 /static_cast<double>(binomialCoefficient(l,k));
343
344 double prob_B = static_cast<double>(0);
345 for(unsigned int i=0; i<s; ++i)
346 {
347 prob_B+=
348 static_cast<double>(binomialCoefficient(t,i))
349 *pow(prob_p, static_cast<double>(i))
350 *pow(static_cast<double>(1)-prob_p, static_cast<double>(t-i));
351 }
352
353 double numerator = log(static_cast<double>(1)-prob_q);
354 double denominator = log(static_cast<double>(prob_B));
355 TType result =
356 static_cast<TType>(ceil(static_cast<double>(numerator/denominator)-static_cast<double>(0.5)));
357
358 if(result<1)
359 {
360 result = 1;
361 }
362
363 return result;
364 }
365
366 //////////////////////////////////////////////////////////////////////////////
367
368 /*
369 .Function.findMotif (for PROJECTION)
370 ..summary:Represents the PROJECTION algorithm.
371 ..cat:Motif Search
372 ..signature:findMotif(finder,dataset,seq_model)
373 ..param.finder:The @Class.MotifFinder@ object.
374 ...type:Class.MotifFinder
375 ..param.dataset:The dataset object representing the input sequences.
376 ...type:Class.StringSet
377 ..param.seq_model:The seq_model object.
378 ...type:Tag.Oops
379 ...type:Tag.Zoops
380 ...type:Tag.Tcm
381 ...remarks:The sequence models rely on different assumptions about the distribution of motif occurrences
382 across the sample sequences.
383 ..remarks:The PROJECTION algorithm which consists of two steps, the filtering and the refinement step,
384 is able to run in Oops, Zoops and Tcm mode.
385 ..remarks:The algorithm uses the EM procedure during the refinement phase which was introduced by Bailey
386 and Elkan.
387 ..include:seqan/find_motif.h
388 */
389
390 template<typename TSeqType, typename TStrings, typename TModel>
391 void
findMotif(MotifFinder<TSeqType,Projection> & finder,TStrings & dataset,TModel seq_model)392 findMotif(MotifFinder<TSeqType, Projection> & finder,
393 TStrings & dataset,
394 TModel seq_model)
395 {
396 SEQAN_CHECKPOINT;
397
398 typedef typename Value<TStrings>::Type TString;
399 typedef typename Position<TString>::Type TPos;
400 typedef String<int> TArray;
401 typedef std::vector<int> TBucket;
402 typedef typename Size<TArray>::Type TArSize;
403 typedef String<TBucket> TBuckets;
404
405 // dataset information
406 typedef typename Size<TStrings>::Type TStringsSize;
407 TStringsSize t = length(dataset);
408
409 // count_ar:=array of votes for each h(k-mer)
410 TArSize ar_size =
411 static_cast<TArSize>(pow(static_cast<double>(ValueSize<TSeqType>::VALUE), static_cast<int>(finder.projection_size)));
412 finder.score = -1;
413 //std::set< String<int> > occurred_positions;
414 for(unsigned int trial=0; trial<finder.num_of_trials; ++trial)
415 {
416 //
417 //std::cout << " . ";
418 //
419
420 TArray count_ar;
421 resize(count_ar, ar_size);
422
423 // ----------------------------------------------------------------------------
424 // STEP 1:
425 // filtering phase (:=key random projection phase)
426 // ----------------------------------------------------------------------------
427 // choose randomly k different positions
428 std::set<int> positions;
429 choosePositions(positions,finder.motif_size,finder.projection_size);
430
431 // array of collection of l-mers
432 TBuckets bucket_ar;
433 unsigned int num_of_relevant_buckets = 0;
434
435 _filteringStep(bucket_ar,
436 count_ar,
437 num_of_relevant_buckets,
438 dataset,
439 positions,
440 finder.motif_size,
441 finder.bucket_threshold);
442
443 // ----------------------------------------------------------------------------
444 // STEP 2:
445 // checking phase (:= local search-based refinement procedure)
446 // ----------------------------------------------------------------------------
447 TPos i = 0;
448 TPos j = 0;
449 while( (j<num_of_relevant_buckets) & (i<ar_size) )
450 {
451 unsigned int bucket_size = (bucket_ar[i]).size();
452 if(bucket_size>=finder.bucket_threshold)
453 {
454 ++j;
455
456 TStrings l_mers;
457 resize(l_mers, bucket_size);
458 TBucket::iterator bucket_iter, bucket_end;
459 bucket_iter = (bucket_ar[i]).begin();
460 bucket_end = (bucket_ar[i]).end();
461 int bucket_element = 0;
462 while(bucket_iter!=bucket_end)
463 {
464 TString l_mer =
465 inverseHash<TSeqType>(*bucket_iter, ValueSize<TSeqType>::VALUE, finder.motif_size);
466 l_mers[bucket_element] = l_mer;
467 ++bucket_element;
468 ++bucket_iter;
469 }
470
471 TString consensus_pat;
472 TStrings motif_instances;
473 int score =
474 _refinementStep(consensus_pat,
475 l_mers,dataset,
476 finder.motif_size,
477 finder.num_of_substitutions,
478 finder.has_exact_substitutions,
479 seq_model);
480
481 if(score>finder.score)
482 {
483 finder.score = score;
484 if (!length(finder.set_of_motifs)) appendValue(finder.set_of_motifs, consensus_pat);
485 else finder.set_of_motifs[0] = consensus_pat;
486
487 if((TStringsSize) finder.score==t)
488 {
489 trial = finder.num_of_trials;
490 j = num_of_relevant_buckets;
491 }
492 }
493 }
494 ++i;
495 }// end while( (j<num_of_relevant_buckets) & (i<ar_size) )
496 }// end for(unsigned int trial=0; trial<finder.num_of_trials; ++trial)
497
498 //
499 //std::cout << "\n";
500 //
501 }
502
503 //////////////////////////////////////////////////////////////////////////////
504
505 /*
506 .Function._filteringStep:
507 ..summary:Given a position set with k different positions we compute a projection value
508 for each l-mer in the input sequences and store the specific l-mer in the appropriate
509 bucket which is labeled with the specific projection value.
510 ..cat:Motif Search
511 ..signature:_filteringStep(buckets,count_ar,num_of_relevant_buckets,dataset,shape,l,s)
512 ..include:seqan/find_motif.h
513 */
514
515 template<typename TBucketAr, typename TArray, typename TType, typename TStrings, typename TPositions>
516 void
_filteringStep(TBucketAr & buckets,TArray & count_ar,TType & num_of_relevant_buckets,TStrings & dataset,TPositions & positions,TType const & l,TType const & s)517 _filteringStep(TBucketAr & buckets,
518 TArray & count_ar,
519 TType & num_of_relevant_buckets,
520 TStrings & dataset,
521 TPositions & positions,
522 TType const & l,
523 TType const & s)
524 {
525 SEQAN_CHECKPOINT;
526
527 typedef typename Value<TStrings>::Type TString;
528 typedef typename Value<TString>::Type TValue;
529 typedef typename Value<TBucketAr>::Type TBucket;
530 typename Iterator<TStrings>::Type ds_iter = begin(dataset);
531 typename Size<TArray>::Type ar_size = length(count_ar);
532 Shape<TValue> shape(l); //to compute hash value of l-mer x
533
534 // initialize pointer by setting it to null
535 // (=std::fill(begin(count_ar),end(count_ar),0))
536 typename Iterator<TArray>::Type count_ar_iter = begin(count_ar) ;
537 typename Iterator<TArray>::Type count_ar_end = end(count_ar);
538 while(count_ar_iter!=count_ar_end)
539 {
540 *count_ar_iter = 0;
541 ++count_ar_iter;
542 }
543
544 // go over input sequences & increment corresponding counter in count_ar
545 // fill l_mer_index with entries
546 resize(buckets, ar_size);
547 int y = 0; //hash-value of created k-mer
548 int x = 0; //hash-value of l-mer
549 for(; !atEnd(ds_iter, dataset); goNext(ds_iter))
550 {
551 typename Size<TString>::Type seq_length = length(*ds_iter);
552 typename Iterator<TString>::Type seq_iter = begin(*ds_iter);
553 typename Iterator<TString>::Type seq_end = begin(*ds_iter)+(seq_length-l+1);
554 while( seq_iter!=seq_end )
555 {
556 x = hash(shape, seq_iter);
557 y = projectLMer<TValue>(positions, seq_iter);
558 ++count_ar[y];
559 (buckets[y]).push_back(x);
560 ++seq_iter;
561 }
562 }
563
564 num_of_relevant_buckets =
565 std::count_if(begin(count_ar),
566 end(count_ar),
567 bind2nd(std::greater_equal<int>(),static_cast<int>(s)));
568 }
569
570 //////////////////////////////////////////////////////////////////////////////
571
572 /*
573 .Function._refinementStep:
574 ..summary:Refines the collection of l-mers in each relevant bucket which contains at least s l-mers.
575 ..cat:Motif Search
576 ..signature:_refinementStep(consensus_seq,positions,l_mers,dataset,t,l,d,is_exact,model_type)
577 ..include:seqan/find_motif.h
578 */
579
580 //////////////////////////////////////////////////////////////////////////////
581 // Oops model
582 //////////////////////////////////////////////////////////////////////////////
583
584 template<typename TString, typename TType>
585 int
_refinementStep(TString & consensus_seq,String<TString> const & l_mers,String<TString> & dataset,TType const & l,TType const & d,bool const & is_exact,Oops const & oops)586 _refinementStep(TString & consensus_seq,
587 String<TString> const & l_mers,
588 String<TString> & dataset,
589 TType const & l,
590 TType const & d,
591 bool const & is_exact,
592 Oops const & oops)
593 {
594 SEQAN_CHECKPOINT;
595
596 typedef String<TString> TStrings;
597 typedef typename Value<TString>::Type TValue;
598 typedef typename Position<TString>::Type TPos;
599 typedef FrequencyDistribution<TValue> TFrequencyDistribution;
600 TType t = length(dataset);
601 int score = 0;
602
603 // compute background frequency
604 TFrequencyDistribution background;
605 backgroundFrequency(background, begin(dataset), end(dataset));
606
607 // step1: initial guess (profile) from bucket
608 double epsilon = 0.1;
609 Pseudocount<TValue, CMode> pseudocount_mode_c(epsilon);
610 String<TFrequencyDistribution> profile;
611 convertSetOfPatternsToProfile(profile, l_mers, pseudocount_mode_c);
612 completeProfile(profile, background);
613
614 // step2: refinement of initial profile with em: 5 trials
615 double likelihood_score = 0;
616 int iterations = 3; //5
617
618 while(iterations>0)
619 {
620 likelihood_score = em(profile, begin(dataset), t, l, oops);
621 --iterations;
622 }
623
624 determineConsensusSeq(consensus_seq, profile, l);
625 typename Iterator<TStrings>::Type ds_iter, ds_end;
626 ds_iter = begin(dataset);
627 ds_end = end(dataset);
628 typename Position<TStrings>::Type seq_nr;
629 do
630 {
631 seq_nr = t-(ds_end-ds_iter);
632 TPos m = (TPos)(length(*ds_iter)-l+1);
633 int * hd_ar = new int[m];
634 typename Iterator<TString>::Type seq_iter, seq_end, consensus_begin;
635 seq_iter = begin(*ds_iter);
636 seq_end = seq_iter+m;
637 while(seq_iter!=seq_end)
638 {
639 consensus_begin = begin(consensus_seq);
640 hd_ar[m-(seq_end-seq_iter)] = hammingDistance<int>(seq_iter, seq_iter+l, consensus_begin);
641 ++seq_iter;
642 }
643
644 if( (!is_exact) &
645 (count_if(hd_ar,hd_ar+m,bind2nd(std::less_equal<TType>(), d))==1) )
646 {
647 ++score;
648 }
649 else if( is_exact &
650 (count_if(hd_ar,hd_ar+m,bind2nd(std::equal_to<TType>(), d))==1) )
651 {
652 ++score;
653 }
654
655 delete[] hd_ar;
656 ++ds_iter;
657 }
658 while( (ds_iter!=ds_end) & (score==(int) seq_nr+1) );
659
660 return score;
661 }
662
663 //////////////////////////////////////////////////////////////////////////////
664 // Omops model
665 //////////////////////////////////////////////////////////////////////////////
666
667 template<typename TString, typename TType>
668 int
_refinementStep(TString & consensus_seq,String<TString> const & l_mers,String<TString> & dataset,TType const & l,TType const & d,bool const & is_exact,Omops const &)669 _refinementStep(TString & consensus_seq,
670 String<TString> const & l_mers,
671 String<TString> & dataset,
672 TType const & l,
673 TType const & d,
674 bool const & is_exact,
675 Omops const & /*omops*/)
676 {
677 typedef String<TString> TStrings;
678 typedef typename Value<TString>::Type TValue;
679 typedef typename Position<TString>::Type TPos;
680 typedef FrequencyDistribution<TValue> TFrequencyDistribution;
681 TType t = length(dataset);
682 int score = 0;
683
684 // compute background frequency
685 TFrequencyDistribution background;
686 backgroundFrequency(background, begin(dataset), end(dataset));
687
688 // step1: initial guess (profile) from bucket
689 double epsilon = 0.1;
690 Pseudocount<TValue, CMode> pseudocount_mode_c(epsilon);
691 String<TFrequencyDistribution> profile;
692 convertSetOfPatternsToProfile(profile, l_mers, pseudocount_mode_c);
693 completeProfile(profile, background);
694
695 // step2: refinement of initial profile with em: 5 trials
696 double likelihood_score = 0;
697 int iterations = 3; //5
698
699 while(iterations>0)
700 {
701 likelihood_score = em(profile, begin(dataset), t, l, Oops());
702 --iterations;
703 }
704
705 determineConsensusSeq(consensus_seq, profile, l);
706 typename Iterator<TStrings>::Type ds_iter, ds_end;
707 ds_iter = begin(dataset);
708 ds_end = end(dataset);
709 typename Position<TStrings>::Type seq_nr;
710 do
711 {
712 seq_nr = t-(ds_end-ds_iter);
713 TPos m = (TPos)(length(*ds_iter)-l+1);
714 typename Iterator<TString>::Type seq_iter, seq_end, consensus_begin;
715 seq_iter = begin(*ds_iter);
716 seq_end = seq_iter+m;
717 while(seq_iter!=seq_end)
718 {
719 consensus_begin = begin(consensus_seq);
720 TType hd = hammingDistance<TType>(seq_iter, seq_iter+l, consensus_begin);
721
722 if( (!is_exact) & (hd<=d) )
723 {
724 ++score;
725 seq_iter = seq_end-1;
726 }
727 else if( is_exact & (hd==d) )
728 {
729 ++score;
730 seq_iter = seq_end-1;
731 }
732 ++seq_iter;
733 }
734 ++ds_iter;
735 }
736 while( (ds_iter!=ds_end) & (score== (int) seq_nr+1) );
737
738 return score;
739 }
740
741 //////////////////////////////////////////////////////////////////////////////
742 // Zoops model
743 //////////////////////////////////////////////////////////////////////////////
744
745 template<typename TString, typename TType>
746 int
_refinementStep(TString & consensus_seq,String<TString> const & l_mers,String<TString> & dataset,TType const & l,TType const & d,bool const & is_exact,Zoops const & zoops)747 _refinementStep(TString & consensus_seq,
748 String<TString> const & l_mers,
749 String<TString> & dataset,
750 TType const & l,
751 TType const & d,
752 bool const & is_exact,
753 Zoops const & zoops)
754 {
755 SEQAN_CHECKPOINT;
756
757 typedef String<TString> TStrings;
758 typedef typename Value<TString>::Type TValue;
759 typedef typename Position<TString>::Type TPos;
760 typedef FrequencyDistribution<TValue> TFrequencyDistribution;
761 TType t = length(dataset);
762 int score = 0;
763 int lower_limit = (int) floor(t*(zoops.threshold)+0.5);
764
765 // compute background frequency
766 TFrequencyDistribution background;
767 backgroundFrequency(background, begin(dataset), end(dataset));
768
769 // step1: initial guess (profile) from bucket
770 double epsilon = 0.1;
771 Pseudocount<TValue, CMode> pseudocount_mode_c(epsilon);
772 String<TFrequencyDistribution> profile;
773 convertSetOfPatternsToProfile(profile, l_mers, pseudocount_mode_c);
774 completeProfile(profile, background);
775
776 // step2: refinement of initial profile with em: 5 trials
777 double likelihood_score = 0;
778 double gamma = static_cast<double>(1)/sqrt(static_cast<double>(t));
779 int iterations = 3; //5
780 while(iterations>0)
781 {
782 likelihood_score = em(profile, begin(dataset), t, l, gamma, zoops);
783 --iterations;
784 }
785
786 determineConsensusSeq(consensus_seq, profile, l);
787 typename Iterator<TStrings>::Type ds_iter, ds_end;
788 ds_iter = begin(dataset);
789 ds_end = end(dataset);
790 typename Position<TStrings>::Type seq_nr;
791 do
792 {
793 seq_nr = t-(ds_end-ds_iter);
794 TPos m = (TPos)(length(*ds_iter)-l+1);
795 int * hd_ar = new int[m];
796 typename Iterator<TString>::Type seq_iter, seq_end, consensus_begin;
797 seq_iter = begin(*ds_iter);
798 seq_end = seq_iter+m;
799 while(seq_iter!=seq_end)
800 {
801 consensus_begin = begin(consensus_seq);
802 hd_ar[m-(seq_end-seq_iter)] = hammingDistance<int>(seq_iter, seq_iter+l, consensus_begin);
803 ++seq_iter;
804 }
805
806 if(!is_exact)
807 {
808 TType num = count_if(hd_ar,hd_ar+m,bind2nd(std::less_equal<TType>(), d));
809 if(num>1)
810 {
811 ds_iter = ds_end-1;
812 }
813 else if(num==1)
814 {
815 ++score;
816 }
817 }
818 else
819 {
820 TType num = count_if(hd_ar,hd_ar+m,bind2nd(std::equal_to<TType>(), d));
821 if(num>1)
822 {
823 ds_iter = ds_end-1;
824 }
825 else if(num==1)
826 {
827 ++score;
828 }
829 }
830 delete[] hd_ar;
831 ++ds_iter;
832 }
833 while(ds_iter!=ds_end);
834
835 if(score<lower_limit)
836 {
837 score = 0;
838 }
839
840 return score;
841 }
842
843 //////////////////////////////////////////////////////////////////////////////
844 // Tcm model
845 //////////////////////////////////////////////////////////////////////////////
846
847 template<typename TString, typename TType>
848 int
_refinementStep(TString & consensus_seq,String<TString> const & l_mers,String<TString> & dataset,TType const & l,TType const & d,bool const & is_exact,Tcm const & tcm)849 _refinementStep(TString & consensus_seq,
850 String<TString> const & l_mers,
851 String<TString> & dataset,
852 TType const & l,
853 TType const & d,
854 bool const & is_exact,
855 Tcm const & tcm)
856 {
857 SEQAN_CHECKPOINT;
858
859 typedef String<TString> TStrings;
860 typedef typename Value<TString>::Type TValue;
861 typedef typename Position<TString>::Type TPos;
862 typedef FrequencyDistribution<TValue> TFrequencyDistribution;
863 TType t = length(dataset);
864 int score = 0;
865 int lower_limit = (int) floor(t*(tcm.threshold)+0.5);
866
867 // compute background frequency
868 TFrequencyDistribution background;
869 backgroundFrequency(background, begin(dataset), end(dataset));
870
871 // step1: initial guess (profile) from bucket
872 double epsilon = 0.1;
873 Pseudocount<TValue, CMode> pseudocount_mode_c(epsilon);
874 String<TFrequencyDistribution> profile;
875 convertSetOfPatternsToProfile(profile, l_mers, pseudocount_mode_c);
876 completeProfile(profile, background);
877
878 // step2: refinement of initial profile with em: 5 trials
879 double gamma = static_cast<double>(1)/sqrt(static_cast<double>(t));
880 double lambda = 0;
881 typename Iterator<TStrings>::Type ds_iter, ds_end;
882 ds_iter = begin(dataset);
883 ds_end = end(dataset);
884 while(ds_iter!=ds_end)
885 {
886 TPos m = (TPos)(length(*ds_iter)-l+1);
887 lambda += (gamma/((double)m));
888 ++ds_iter;
889 }
890 lambda = lambda/((double)t);
891
892 double likelihood_score = 0;
893 int iterations = 3; //3
894 while(iterations>0)
895 {
896 likelihood_score = em(profile, begin(dataset), t, l, lambda, tcm);
897 --iterations;
898 }
899
900 determineConsensusSeq(consensus_seq, profile, l);
901 ds_iter = begin(dataset);
902 typename Position<TStrings>::Type seq_nr;
903 do
904 {
905 seq_nr = t-(ds_end-ds_iter);
906 TPos m = (TPos)(length(*ds_iter)-l+1);
907 typename Iterator<TString>::Type seq_iter, seq_end, consensus_begin;
908 seq_iter = begin(*ds_iter);
909 seq_end = seq_iter+m;
910 while(seq_iter!=seq_end)
911 {
912 consensus_begin = begin(consensus_seq);
913 TType hd = hammingDistance<TType>(seq_iter, seq_iter+l, consensus_begin);
914
915 if( (!is_exact) & (hd<=d) )
916 {
917 ++score;
918 seq_iter = seq_end-1;
919 }
920 else if( is_exact & (hd==d) )
921 {
922 ++score;
923 seq_iter = seq_end-1;
924 }
925 ++seq_iter;
926 }
927 ++ds_iter;
928 }
929 while(ds_iter!=ds_end);
930
931 if(score<lower_limit)
932 {
933 score = 0;
934 }
935
936 return score;
937 }
938
939 //////////////////////////////////////////////////////////////////////////////
940 //Subfunctions
941 //////////////////////////////////////////////////////////////////////////////
942
943 /*
944 .Function.choosePositions:
945 ..summary:Chooses randomly k different positions from {0,1,...,(l-1)}
946 ..cat:Motif Search
947 ..signature:choosePositions(positions,l,k)
948 ..param.positions:The set of k chosen positions.
949 ...type:$set<int>$
950 ..param.l:The size of the motif.
951 ..param.k:The projection size.
952 ..include:seqan/find_motif.h
953 */
954
955 template<typename TAssociativeContainer, typename TType>
956 void
choosePositions(TAssociativeContainer & positions,TType const & l,TType const & k)957 choosePositions(TAssociativeContainer & positions, TType const & l, TType const & k)
958 {
959 SEQAN_CHECKPOINT;
960 while(positions.size()<k)
961 {
962 int position = rand() % l;
963 positions.insert(position);
964 }
965 }
966
967 //////////////////////////////////////////////////////////////////////////////
968
969 /*
970 .Function.projectLMer:
971 ..summary:Based on set "positions" the function uses the letters of a given l-mer at
972 chosen k positions to compute an appropriate hash value of the new k-mer.
973 ..cat:Motif Search
974 ..signature:projectLMer(positions,l,k)
975 ..param.positions:The set of k chosen positions.
976 ...remarks:$positions$ is of type $set<int>$
977 ..param.k:An iterator pointing to the first positions of a given sequence.
978 ..include:seqan/find_motif.h
979 */
980
981 template<typename TValue, typename TIter>
982 inline std::set<int>::value_type
projectLMer(std::set<int> & positions,TIter it)983 projectLMer(std::set<int> & positions, TIter it)
984 {
985 SEQAN_CHECKPOINT;
986
987 typedef std::set<int>::value_type THValue;
988 THValue prev_pos; //, cur_pos;
989
990 std::set<int>::iterator positions_iter, positions_end;
991 positions_iter = positions.begin();
992 positions_end = positions.end();
993 prev_pos = *positions_iter;
994 it += prev_pos;
995 THValue hValue = ordValue(*it);
996 ++positions_iter;
997 while(positions_iter!=positions_end)
998 {
999 THValue cur_pos = *positions_iter;
1000 goFurther(it, (cur_pos-prev_pos));
1001 hValue = hValue * ValueSize<TValue>::VALUE + ordValue(*it);
1002 prev_pos = cur_pos;
1003 ++positions_iter;
1004 }
1005
1006 return hValue;
1007 }
1008
1009
1010 //////////////////////////////////////////////////////////////////////////////
1011
1012 /*
1013 .Function._getLMersWithTheLargestLikelihoodRatio:
1014 ..summary:Forms a guess for the planted motif by selecting from each input sequence
1015 the l-mer x with the largest likelihood ratio.
1016 ..signature:_getLMersWithTheLargestLikelihoodRatio(l_mers,positions,dataset_start,dataset_end,profile,l)
1017 ..param.l_mers:The collection of t l-mers.
1018 ..param.dataset_start:An iterator pointing to the first input sequence of a given dataset.
1019 ..param.dataset_end:An iterator pointing to the last input sequence of a given dataset.
1020 ..param.t:The number of input sequences.
1021 ..param.profile:The profile object which is a set of frequency distributions.
1022 ...type:Class.String
1023 ....signature:String<TFrequencyDistribution>
1024 ..param.l:The size of the motif.
1025 ..include:seqan/find_motif.h
1026 */
1027
1028 template<typename TStrings, typename TIter, typename TType, typename TProfile>
1029 void
_getLMersWithTheLargestLikelihoodRatio(TStrings & l_mers,TIter dataset_start,TIter dataset_end,TProfile const & profile,TType const & l)1030 _getLMersWithTheLargestLikelihoodRatio(TStrings & l_mers,
1031 TIter dataset_start,
1032 TIter dataset_end,
1033 TProfile const & profile,
1034 TType const & l)
1035 {
1036 SEQAN_CHECKPOINT;
1037
1038 typedef typename Value<TStrings>::Type TString;
1039 typedef typename Position<TStrings>::Type TPos1;
1040 typedef typename Position<TString>::Type TPos2;
1041 typename Size<TStrings>::Type t = (dataset_end-dataset_start);
1042 resize(l_mers, t);
1043 while(dataset_start!=dataset_end)
1044 {
1045 typename Position<TStrings>::Type seq_nr = t-(dataset_end-dataset_start);
1046 TPos2 m = (TPos2)(length(*dataset_start)-l+1);
1047 double * likelihood_ratio_mat = new double[m];
1048 typename Iterator<TString>::Type seq_iter, seq_end;
1049 seq_iter = begin(*dataset_start);
1050 seq_end = seq_iter+m;
1051 while(seq_iter!=seq_end)
1052 {
1053 likelihood_ratio_mat[m-(seq_end-seq_iter)] =
1054 _computeLikelihoodRatioOfLMer(seq_iter, seq_iter+l, profile);
1055 ++seq_iter;
1056 }
1057
1058 TPos2 max_pos =
1059 (std::max_element(likelihood_ratio_mat, likelihood_ratio_mat+m)-likelihood_ratio_mat);
1060 l_mers[seq_nr] = infix(*dataset_start, max_pos, max_pos+l);
1061
1062 delete[] likelihood_ratio_mat;
1063 ++dataset_start;
1064 }
1065 }
1066
1067 //////////////////////////////////////////////////////////////////////////////
1068
1069 /*
1070 .Function._computeLikelihoodRatioOfLMer:
1071 ..summary:Computes the likelihood ratio of a given l-mer.
1072 ..signature:_computeLikelihoodRatioOfLMer(l_mer_begin,l_mer_end,profile)
1073 ..param.l_mer_begin:An iterator pointing to the beginning of a given l-mer pattern.
1074 ...type:Concept.Iterator Iterator
1075 ....remarks:Standard conform iterator
1076 ...type:Shortcut.DnaIterator
1077 ....remarks:Iterator for @Shortcut.DnaString@ (a string of @Spec.Dna@).
1078 ....see:Shortcut.DnaIterator
1079 ...type:Shortcut.PeptideIterator
1080 ....remarks:Iterator for @Shortcut.Peptide@ (a string of @Spec.AminoAcid@).
1081 ....see:Shortcut.PeptideIterator
1082 ..param.l_mer_end:An iterator pointing to the end of a given l-mer pattern.
1083 ...type:Concept.Iterator Iterator
1084 ....remarks:Standard conform iterator
1085 ...type:Shortcut.DnaIterator
1086 ....remarks:Iterator for @Shortcut.DnaString@ (a string of @Spec.Dna@).
1087 ....see:Shortcut.DnaIterator
1088 ...type:Shortcut.PeptideIterator
1089 ....remarks:Iterator for @Shortcut.Peptide@ (a string of @Spec.AminoAcid@).
1090 ....see:Shortcut.PeptideIterator
1091 ..param.profile:The profile object which is a set of frequency distributions.
1092 ...type:Class.String
1093 ....signature:String<TFrequencyDistribution>
1094 ..remarks:Computes the sum of log probabilites instead of the product of probabilites
1095 ..include:seqan/find_motif.h
1096 */
1097
1098 template<typename TStrIter, typename TProfile>
1099 double
_computeLikelihoodRatioOfLMer(TStrIter l_mer_begin,TStrIter l_mer_end,TProfile const & profile)1100 _computeLikelihoodRatioOfLMer(TStrIter l_mer_begin,
1101 TStrIter l_mer_end,
1102 TProfile const & profile)
1103 {
1104 SEQAN_CHECKPOINT;
1105
1106 double result = 0;
1107 unsigned int l = (unsigned int)(l_mer_end-l_mer_begin);
1108 typedef typename Position<TProfile>::Type TPos;
1109 TProfile log_profile = profile;
1110 for(TPos i=0; i<length(log_profile); ++i)
1111 {
1112 logarithmize(log_profile[i]);
1113 }
1114
1115 double motif_component = 0;
1116 double backgr_component = 0;
1117 while(l_mer_begin!=l_mer_end)
1118 {
1119 motif_component += log_profile[l-(l_mer_end-l_mer_begin)+1][(int)*l_mer_begin];
1120 backgr_component += log_profile[0][(int)*l_mer_begin];
1121 ++l_mer_begin;
1122 }
1123 result = motif_component-backgr_component;
1124
1125 return exp(result);
1126 }
1127
1128 //////////////////////////////////////////////////////////////////////////////
1129
1130 /*
1131 .Function._computeLikelihoodRatioOfLMers:
1132 ..summary:Computes the likelihood ratio of a given set of l-mers.
1133 ..signature:_computeLikelihoodRatioOfLMers(l_mers,profile)
1134 ..param.l_mers:The collection of l-mers.
1135 ..param.profile:The profile object which is a set of frequency distributions.
1136 ...type:Class.String
1137 ....signature:String<TFrequencyDistribution>
1138 ..include:seqan/find_motif.h
1139 */
1140
1141 template<typename TStrings, typename TProfile>
1142 double
_computeLikelihoodRatioOfLMers(TStrings const & l_mers,TProfile const & profile)1143 _computeLikelihoodRatioOfLMers(TStrings const & l_mers,
1144 TProfile const & profile)
1145 {
1146 SEQAN_CHECKPOINT;
1147
1148 typedef typename Position<TStrings>::Type TPos;
1149 double score = 1;
1150 typename Size<TStrings>::Type num_of_l_mers = length(l_mers);
1151 for(TPos i=0; i<num_of_l_mers; ++i)
1152 {
1153 score *= _computeLikelihoodRatioOfLMer(begin(l_mers[i]), end(l_mers[i]), profile);
1154 }
1155
1156 return score;
1157 }
1158
1159 //////////////////////////////////////////////////////////////////////////////
1160
1161 /**
1162 .Function.determineConsensusSeq:
1163 ..summary:Determines the consensus pattern of a given profile.
1164 ..cat:Motif Search
1165 ..signature:determineConsensusSeq(consensus_seq,profile,l)
1166 ..param.consensus_seq:The consensus pattern.
1167 ...type:Class.String
1168 ...type:Shortcut.DnaString
1169 ...type:Shortcut.Peptide
1170 ..param.profile:A StringSet of @Class.FrequencyDistribution|frequency distributions@.
1171 ...type:Class.StringSet
1172 ..param.l:The size of the motif.
1173 ..include:seqan/find_motif.h
1174 */
1175
1176 template<typename TString, typename TProfile>
1177 void
determineConsensusSeq(TString & consensus_seq,TProfile & profile,typename Size<TString>::Type const & l)1178 determineConsensusSeq(TString & consensus_seq,
1179 TProfile & profile,
1180 typename Size<TString>::Type const & l)
1181 {
1182 SEQAN_CHECKPOINT;
1183
1184 typedef typename Value<TString>::Type TValue;
1185 typename Position<TString>::Type i;
1186
1187 resize(consensus_seq, l);
1188 if(length(profile)==l) //profile only consists of the motif component
1189 {
1190 for(i=0; i<l; ++i)
1191 {
1192 consensus_seq[i] =
1193 static_cast<TValue>(posOfMax(profile[i]));
1194 }
1195 }
1196 else
1197 {
1198 for(i=1; i<=l; ++i)
1199 {
1200 consensus_seq[i-1] =
1201 static_cast<TValue>(posOfMax(profile[i]));
1202 }
1203 }
1204 }
1205
1206 //////////////////////////////////////////////////////////////////////////////
1207
1208 /*
1209 .Function.displayResult:
1210 ..summary:Displays the consensus pattern of the found motif candidate.
1211 ..cat:Motif Search
1212 ..signature:displayResult(motif_finder)
1213 ..param.motif_finder:The @Class.MotifFinder@ object.
1214 ...type:Class.MotifFinder
1215 ..param.dataset:The dataset object representing the input sequences.
1216 ...type:Class.String
1217 ...signature:String<TString>
1218 ...param.TString:A @Class.String@ type
1219 ....type:Class.String
1220 ..include:seqan/find_motif.h
1221 */
1222
1223 template<typename TValue>
1224 void
displayResult(MotifFinder<TValue,Projection> & projection)1225 displayResult(MotifFinder<TValue, Projection> & projection)
1226 {
1227 SEQAN_CHECKPOINT;
1228
1229 if(length(projection.set_of_motifs)!=0)
1230 {
1231 std::cout << projection.set_of_motifs[0] << "\n";
1232 }
1233 else
1234 {
1235 std::cout << "NO MOTIF HAS BEEN FOUND!!!\n";
1236 }
1237 }
1238
1239 //////////////////////////////////////////////////////////////////////////////////////////////
1240 // Access Functions
1241 //////////////////////////////////////////////////////////////////////////////////////////////
1242
1243 template <typename TValue>
1244 inline int
getScore(MotifFinder<TValue,Projection> & me)1245 getScore(MotifFinder<TValue, Projection> & me)
1246 {
1247 SEQAN_CHECKPOINT;
1248 return me.score;
1249 }
1250
1251
1252 //////////////////////////////////////////////////////////////////////////////////////////////
1253
1254 }// namespace SEQAN_NAMESPACE_MAIN
1255
1256 #endif //#ifndef SEQAN_HEADER_...
1257