1 /*  $Id: seq_masker.cpp 464810 2015-04-14 16:29:42Z vakatov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Aleksandr Morgulis
27  *
28  * File Description:
29  *   CSeqMasker class member and method definitions.
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 
35 #include <corelib/ncbi_limits.h>
36 
37 #include <algo/winmask/seq_masker_window.hpp>
38 #include <algo/winmask/seq_masker_window_ambig.hpp>
39 #include <algo/winmask/seq_masker_window_pattern.hpp>
40 #include <algo/winmask/seq_masker_window_pattern_ambig.hpp>
41 #include <algo/winmask/seq_masker_score_mean.hpp>
42 #include <algo/winmask/seq_masker_score_min.hpp>
43 #include <algo/winmask/seq_masker_score_mean_glob.hpp>
44 #include <algo/winmask/seq_masker.hpp>
45 #include <algo/winmask/seq_masker_util.hpp>
46 #include <algo/winmask/seq_masker_istat_factory.hpp>
47 #include <algo/winmask/seq_masker_cache_boost.hpp>
48 
49 #include <algorithm>
50 #include <memory>
51 #include <sstream>
52 
53 
54 BEGIN_NCBI_SCOPE
55 USING_SCOPE(objects);
56 
57 #define WIN_MASK_ALGO_NAME "window-masker-algorithm"
58 #define WIN_MASK_ALGO_VER_MAJOR 1
59 #define WIN_MASK_ALGO_VER_MINOR 0
60 #define WIN_MASK_ALGO_VER_PATCH 0
61 
62 //-------------------------------------------------------------------------
63 CSeqMaskerVersion CSeqMasker::AlgoVersion(
64         WIN_MASK_ALGO_NAME,
65         WIN_MASK_ALGO_VER_MAJOR,
66         WIN_MASK_ALGO_VER_MINOR,
67         WIN_MASK_ALGO_VER_PATCH
68 );
69 
70 //-------------------------------------------------------------------------
CSeqMasker(const string & lstat_name,Uint1 arg_window_size,Uint4 arg_window_step,Uint1 arg_unit_step,Uint4 arg_textend,Uint4 arg_cutoff_score,Uint4 arg_max_score,Uint4 arg_min_score,Uint4 arg_set_max_score,Uint4 arg_set_min_score,bool arg_merge_pass,Uint4 arg_merge_cutoff_score,Uint4 arg_abs_merge_cutoff_dist,Uint4 arg_mean_merge_cutoff_dist,Uint1 arg_merge_unit_step,const string & arg_trigger,Uint1 tmin_count,bool arg_discontig,Uint4 arg_pattern,bool arg_use_ba)71 CSeqMasker::CSeqMasker( const string & lstat_name,
72                         Uint1 arg_window_size,
73                         Uint4 arg_window_step,
74                         Uint1 arg_unit_step,
75                         Uint4 arg_textend,
76                         Uint4 arg_cutoff_score,
77                         Uint4 arg_max_score,
78                         Uint4 arg_min_score,
79                         Uint4 arg_set_max_score,
80                         Uint4 arg_set_min_score,
81                         bool arg_merge_pass,
82                         Uint4 arg_merge_cutoff_score,
83                         Uint4 arg_abs_merge_cutoff_dist,
84                         Uint4 arg_mean_merge_cutoff_dist,
85                         Uint1 arg_merge_unit_step,
86                         const string & arg_trigger,
87                         Uint1 tmin_count,
88                         bool arg_discontig,
89                         Uint4 arg_pattern,
90                         bool arg_use_ba )
91     : ustat( CSeqMaskerIstatFactory::create( lstat_name,
92                                              arg_cutoff_score,
93                                              arg_textend,
94                                              arg_max_score,
95                                              arg_set_max_score,
96                                              arg_min_score,
97                                              arg_set_min_score,
98                                              arg_use_ba ) ),
99       score( NULL ), score_p3( NULL ), trigger_score( NULL ),
100       window_size( arg_window_size ), window_step( arg_window_step ),
101       unit_step( arg_unit_step ),
102       merge_pass( arg_merge_pass ),
103       merge_cutoff_score( arg_merge_cutoff_score ),
104       abs_merge_cutoff_dist( arg_abs_merge_cutoff_dist ),
105       mean_merge_cutoff_dist( arg_mean_merge_cutoff_dist ),
106       merge_unit_step( arg_merge_unit_step ),
107       trigger( arg_trigger == "mean" ? eTrigger_Mean
108                : eTrigger_Min ),
109       discontig( arg_discontig ), pattern( arg_pattern )
110 {
111     if( window_size == 0 ) window_size = ustat->UnitSize() + 4;
112 
113     if( window_size < ustat->UnitSize() ) {
114         std::ostringstream os;
115         os << "window size (" << (int)window_size << ") "
116               "must be greater or equal to unit size (" <<
117               (int)ustat->UnitSize() << ")";
118         NCBI_THROW( CSeqMaskerException, eValidation, os.str() );
119     }
120 
121     trigger_score = score = new CSeqMaskerScoreMean( ustat );
122 
123     if( trigger == eTrigger_Min )
124         trigger_score = new CSeqMaskerScoreMin( ustat, tmin_count );
125 
126     if( !score )
127     {
128         NCBI_THROW( CSeqMaskerException,
129                     eScoreAllocFail,
130                     "" );
131     }
132 
133     if( arg_merge_pass )
134     {
135         score_p3 = new CSeqMaskerScoreMeanGlob( ustat );
136 
137         if( !score )
138         {
139             NCBI_THROW( CSeqMaskerException,
140                         eScoreP3AllocFail,
141                         "" );
142         }
143     }
144 }
145 
146 //-------------------------------------------------------------------------
~CSeqMasker()147 CSeqMasker::~CSeqMasker()
148 {
149     if( trigger_score != score ) delete trigger_score;
150 
151     delete score;
152     delete score_p3;
153 }
154 
155 //-------------------------------------------------------------------------
156 CSeqMasker::TMaskList *
operator ()(const CSeqVector & data) const157 CSeqMasker::operator()( const CSeqVector& data ) const
158 { return DoMask( data, 0, data.size() ); }
159 
160 //-------------------------------------------------------------------------
161 CSeqMasker::TMaskList *
DoMask(const CSeqVector & data,TSeqPos begin,TSeqPos stop) const162 CSeqMasker::DoMask(
163     const CSeqVector& data, TSeqPos begin, TSeqPos stop ) const
164 {
165     ustat->total_ = 0;
166     auto_ptr<TMaskList> mask(new TMaskList);
167     Uint4 cutoff_score = ustat->get_threshold();
168     Uint4 textend = ustat->get_textend();
169     Uint1 nbits = discontig ? CSeqMaskerUtil::BitCount( pattern ) : 0;
170     Uint4 unit_size = ustat->UnitSize() + nbits;
171     auto_ptr<CSeqMaskerWindow> window_ptr
172         (discontig ? new CSeqMaskerWindowPattern( data, unit_size,
173                                                   window_size, window_step,
174                                                   pattern, unit_step )
175          : new CSeqMaskerWindow( data, unit_size,
176                                  window_size, window_step,
177                                  unit_step, begin, stop ));
178     CSeqMaskerWindow & window = *window_ptr;
179     score->SetWindow( window );
180 
181     if( trigger == eTrigger_Min ) trigger_score->SetWindow( window );
182 
183     Uint4 start = 0, end = 0, cend = 0;
184     Uint4 limit = textend;
185     const CSeqMaskerIstat::optimization_data * od
186         = ustat->get_optimization_data();
187 
188     CSeqMaskerCacheBoost booster( window, od );
189 
190     while( window )
191     {
192         Uint4 ts = (*trigger_score)();
193         Uint4 s = (*score)();
194         Uint4 adv = window_step;
195 
196         if( s < limit )
197         {
198             if( end > start )
199             {
200                 if( window.Start() > cend )
201                 {
202                     mask->push_back( TMaskedInterval( start, end ) );
203                     start = end = cend = 0;
204                 }
205             }
206 
207             if( od != 0 && od->cba_ != 0 )
208             {
209                 adv = window.Start();
210 
211                 if( !booster.Check() )
212                     break;
213 
214                 adv = window_step*( 1 + window.Start() - adv );
215             }
216         }
217         else if( ts < cutoff_score )
218         {
219             if( end  > start )
220             {
221                 if( window.Start() > cend + 1 )
222                 {
223                     mask->push_back( TMaskedInterval( start, end ) );
224                     start = end = cend = 0;
225                 }
226                 else cend = window.End();
227             }
228         }
229         else
230         {
231             if( end > start )
232             {
233                 if( window.Start() > cend + 1 )
234                 {
235                     mask->push_back( TMaskedInterval( start, end ) );
236                     start = window.Start();
237                 }
238             }
239             else start = window.Start();
240 
241             cend = end = window.End();
242         }
243 
244 
245         if( adv == window_step )
246             ++window;
247 
248         score->PostAdvance( adv );
249     }
250 
251     if( end > start )
252         mask->push_back( TMaskedInterval( start, end ) );
253 
254     window_ptr.reset();
255 
256     if( merge_pass )
257     {
258         Uint1 nbits = discontig ? CSeqMaskerUtil::BitCount( pattern ) : 0;
259         Uint4 unit_size = ustat->UnitSize() + nbits;
260 
261         if( mask->size() < 2 ) return mask.release();
262 
263         TMList masked, unmasked;
264         TMaskList::iterator jtmp = mask->end();
265 
266         {{
267              for( TMaskList::iterator i = mask->begin(), j = --jtmp;
268                   i != j; )
269              {
270                  masked.push_back( mitem( i->first, i->second, unit_size,
271                                           data, *this ) );
272                  Uint4 nstart = (i++)->second - unit_size + 2;
273                  unmasked.push_back( mitem( nstart, i->first + unit_size - 2,
274                                             unit_size, data, *this ) );
275              }
276 
277              masked.push_back( mitem( (mask->rbegin())->first,
278                                       (mask->rbegin())->second,
279                                       unit_size, data, *this ) );
280          }}
281 
282         Int4 count = 0;
283         TMList::iterator ii = masked.begin();
284         TMList::iterator j = unmasked.begin();
285         TMList::iterator k = ii, l = ii;
286         --k; ++l;
287 
288         for( ; ii != masked.end(); k = l = ii, --k, ++l )
289         {
290             Uint4 ldist = (ii != masked.begin())
291                 ? ii->start - k->end - 1 : 0;
292             TMList::iterator tmpend = masked.end();
293             --tmpend;
294             Uint4 rdist = (ii != tmpend)
295                 ? l->start - ii->end - 1 : 0;
296             double lavg = 0.0, ravg = 0.0;
297             bool can_go_left =  count && ldist
298                 && ldist <= mean_merge_cutoff_dist;
299             bool can_go_right =  rdist
300                 && rdist <= mean_merge_cutoff_dist;
301 
302             if( can_go_left )
303             {
304                 TMList::iterator tmp = j; --tmp;
305                 lavg = MergeAvg( k, tmp, unit_size );
306                 can_go_left = can_go_left && (lavg >= merge_cutoff_score);
307             }
308 
309             if( can_go_right )
310             {
311                 ravg = MergeAvg( ii, j, unit_size );
312                 can_go_right = can_go_right && (ravg >= merge_cutoff_score);
313             }
314 
315             if( can_go_right )
316             {
317                 if( can_go_left )
318                 {
319                     if( ravg >= lavg )
320                     {
321                         ++count;
322                         ++ii;
323                         ++j;
324                     }
325                     else // count must be greater than 0.
326                     {
327                         --count;
328                         k->avg = MergeAvg( k, --j, unit_size );
329                         _TRACE( "Merging "
330                                 << k->start << " - " << k->end
331                                 << " and "
332                                 << ii->start << " - " << ii->end );
333                         Merge( masked, k, unmasked, j );
334 
335                         if( count )
336                         {
337                             ii = --k;
338                             --j;
339                             --count;
340                         }
341                         else ii = k;
342                     }
343                 }
344                 else
345                 {
346                     ++count;
347                     ++ii;
348                     ++j;
349                 }
350             }
351             else if( can_go_left )
352             {
353                 --count;
354                 k->avg = MergeAvg( k, --j, unit_size );
355                 _TRACE( "Merging "
356                         << k->start << " - " << k->end
357                         << " and "
358                         << ii->start << " - " << ii->end );
359                 Merge( masked, k, unmasked, j );
360 
361                 if( count )
362                 {
363                     ii = --k;
364                     --j;
365                     --count;
366                 }
367                 else ii = k;
368             }
369             else
370             {
371                 ++ii;
372                 ++j;
373                 count = 0;
374             }
375         }
376 
377         for( ii = masked.begin(), j = unmasked.begin(), k = ii++;
378              ii != masked.end(); (k = ii++), j++ )
379         {
380             if( k->end + abs_merge_cutoff_dist >= ii->start )
381             {
382                 _TRACE( "Unconditionally merging "
383                         << k->start << " - " << k->end
384                         << " and "
385                         << ii->start << " - " << ii->end );
386                 k->avg = MergeAvg( k, j, unit_size );
387                 Merge( masked, k, unmasked, j );
388                 ii = k;
389 
390                 if( ++ii == masked.end() ) break;
391             }
392         }
393 
394         mask->clear();
395 
396         for( TMList::const_iterator iii = masked.begin(); iii != masked.end(); ++iii )
397             mask->push_back( TMaskedInterval( iii->start, iii->end ) );
398     }
399 
400     return mask.release();
401 }
402 
403 //-------------------------------------------------------------------------
MergeAvg(TMList::iterator mi,const TMList::iterator & umi,Uint4 unit_size) const404 double CSeqMasker::MergeAvg( TMList::iterator mi,
405                              const TMList::iterator & umi,
406                              Uint4 unit_size ) const
407 {
408     TMList::iterator tmp = mi++;
409     Uint4 n1 = (tmp->end - tmp->start - unit_size + 2)/merge_unit_step;
410     Uint4 n2 = (umi->end - umi->start - unit_size + 2)/merge_unit_step;
411     Uint4 n3 = (mi->end - mi->start - unit_size + 2)/merge_unit_step;
412     Uint4 N = (mi->end - tmp->start - unit_size + 2)/merge_unit_step;
413     double a1 = tmp->avg, a2 = umi->avg, a3 = mi->avg;
414     return (a1*n1 + a2*n2 + a3*n3)/N;
415 }
416 
417 //-------------------------------------------------------------------------
Merge(TMList & m,TMList::iterator mi,TMList & um,TMList::iterator & umi) const418 void CSeqMasker::Merge( TMList & m, TMList::iterator mi,
419                         TMList & um, TMList::iterator & umi ) const
420 {
421     TMList::iterator tmp = mi++;
422     tmp->end = mi->end;
423     m.erase( mi );
424     umi = um.erase( umi );
425 }
426 
427 //----------------------------------------------------------------------------
GetErrCodeString() const428 const char * CSeqMasker::CSeqMaskerException::GetErrCodeString() const
429 {
430     switch( GetErrCode() )
431     {
432     case eLstatStreamIpenFail:
433 
434         return "can not open input stream";
435 
436     case eLstatSyntax:
437 
438         return "syntax error";
439 
440     case eLstatParam:
441 
442         return  "the following parameters could not be determined"
443                 " from the unit frequency database or command line: ";
444 
445     case eScoreAllocFail:
446 
447         return "score function object allocation failed";
448 
449     case eScoreP3AllocFail:
450 
451         return "merge pass score function object allocation failed";
452 
453     case eValidation:
454 
455         return "validation error";
456 
457     default:
458 
459         return CException::GetErrCodeString();
460     }
461 }
462 
463 //----------------------------------------------------------------------------
mitem(Uint4 arg_start,Uint4 arg_end,Uint1 unit_size,const CSeqVector & data,const CSeqMasker & owner)464 CSeqMasker::mitem::mitem( Uint4 arg_start, Uint4 arg_end, Uint1 unit_size,
465                           const CSeqVector & data, const CSeqMasker & owner )
466     : start( arg_start ), end( arg_end ), avg( 0.0 )
467 {
468     const Uint1 & window_size = owner.window_size;
469     const CSeqMaskerWindow::TUnit & ambig_unit = owner.ustat->AmbigUnit();
470     CSeqMaskerScore * const score = owner.score_p3;
471     CSeqMaskerWindow * window = NULL;
472 
473     if( owner.discontig )
474         window = new CSeqMaskerWindowPatternAmbig( data, unit_size, window_size,
475                                                    owner.merge_unit_step,
476                                                    owner.pattern, ambig_unit,
477                                                    start,
478                                                    owner.merge_unit_step );
479     else
480         window = new CSeqMaskerWindowAmbig( data, unit_size, window_size,
481                                             owner.merge_unit_step,
482                                             ambig_unit, start,
483                                             owner.merge_unit_step );
484 
485     score->SetWindow( *window );
486     Uint4 step = window->Step();
487 
488     while( window->End() < end )
489     {
490         score->PreAdvance( step );
491         ++*window;
492         score->PostAdvance( step );
493     }
494 
495     avg = (*score)();
496     delete window;
497 }
498 
499 //----------------------------------------------------------------------------
MergeMaskInfo(TMaskList * dest,const TMaskList * src)500 void CSeqMasker::MergeMaskInfo( TMaskList * dest, const TMaskList * src )
501 {
502     if( src->empty() )
503         return;
504 
505     TMaskList::const_iterator si( src->begin() );
506     TMaskList::const_iterator send( src->end() );
507     TMaskList::iterator di( dest->begin() );
508     TMaskList::iterator dend( dest->end() );
509     TMaskList res;
510     TMaskedInterval seg;
511     TMaskedInterval next_seg;
512 
513     if( di != dend && di->first < si->first )
514         seg = *(di++);
515     else seg = *(si++);
516 
517     while( true )
518     {
519         if( si != send ) {
520             if( di != dend ) {
521                 if( si->first < di->first ) {
522                     next_seg = *(si++);
523                 } else {
524                     next_seg = *(di++);
525                 }
526             } else {
527                 next_seg = *(si++);
528             }
529         } else if( di != dend ) {
530             next_seg = *(di++);
531         } else {
532             break;
533         }
534 
535         if( seg.second + 1 < next_seg.first ) {
536             res.push_back( seg );
537             seg = next_seg;
538         }
539         else if( seg.second < next_seg.second ) {
540             seg.second = next_seg.second;
541         }
542     }
543 
544     res.push_back( seg );
545     dest->swap(res);
546 }
547 
548 
549 END_NCBI_SCOPE
550