1 /* $Id: seq_masker.cpp 464810 2015-04-14 16:29:42Z vakatov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Aleksandr Morgulis
27 *
28 * File Description:
29 * CSeqMasker class member and method definitions.
30 *
31 */
32
33 #include <ncbi_pch.hpp>
34
35 #include <corelib/ncbi_limits.h>
36
37 #include <algo/winmask/seq_masker_window.hpp>
38 #include <algo/winmask/seq_masker_window_ambig.hpp>
39 #include <algo/winmask/seq_masker_window_pattern.hpp>
40 #include <algo/winmask/seq_masker_window_pattern_ambig.hpp>
41 #include <algo/winmask/seq_masker_score_mean.hpp>
42 #include <algo/winmask/seq_masker_score_min.hpp>
43 #include <algo/winmask/seq_masker_score_mean_glob.hpp>
44 #include <algo/winmask/seq_masker.hpp>
45 #include <algo/winmask/seq_masker_util.hpp>
46 #include <algo/winmask/seq_masker_istat_factory.hpp>
47 #include <algo/winmask/seq_masker_cache_boost.hpp>
48
49 #include <algorithm>
50 #include <memory>
51 #include <sstream>
52
53
54 BEGIN_NCBI_SCOPE
55 USING_SCOPE(objects);
56
57 #define WIN_MASK_ALGO_NAME "window-masker-algorithm"
58 #define WIN_MASK_ALGO_VER_MAJOR 1
59 #define WIN_MASK_ALGO_VER_MINOR 0
60 #define WIN_MASK_ALGO_VER_PATCH 0
61
62 //-------------------------------------------------------------------------
63 CSeqMaskerVersion CSeqMasker::AlgoVersion(
64 WIN_MASK_ALGO_NAME,
65 WIN_MASK_ALGO_VER_MAJOR,
66 WIN_MASK_ALGO_VER_MINOR,
67 WIN_MASK_ALGO_VER_PATCH
68 );
69
70 //-------------------------------------------------------------------------
CSeqMasker(const string & lstat_name,Uint1 arg_window_size,Uint4 arg_window_step,Uint1 arg_unit_step,Uint4 arg_textend,Uint4 arg_cutoff_score,Uint4 arg_max_score,Uint4 arg_min_score,Uint4 arg_set_max_score,Uint4 arg_set_min_score,bool arg_merge_pass,Uint4 arg_merge_cutoff_score,Uint4 arg_abs_merge_cutoff_dist,Uint4 arg_mean_merge_cutoff_dist,Uint1 arg_merge_unit_step,const string & arg_trigger,Uint1 tmin_count,bool arg_discontig,Uint4 arg_pattern,bool arg_use_ba)71 CSeqMasker::CSeqMasker( const string & lstat_name,
72 Uint1 arg_window_size,
73 Uint4 arg_window_step,
74 Uint1 arg_unit_step,
75 Uint4 arg_textend,
76 Uint4 arg_cutoff_score,
77 Uint4 arg_max_score,
78 Uint4 arg_min_score,
79 Uint4 arg_set_max_score,
80 Uint4 arg_set_min_score,
81 bool arg_merge_pass,
82 Uint4 arg_merge_cutoff_score,
83 Uint4 arg_abs_merge_cutoff_dist,
84 Uint4 arg_mean_merge_cutoff_dist,
85 Uint1 arg_merge_unit_step,
86 const string & arg_trigger,
87 Uint1 tmin_count,
88 bool arg_discontig,
89 Uint4 arg_pattern,
90 bool arg_use_ba )
91 : ustat( CSeqMaskerIstatFactory::create( lstat_name,
92 arg_cutoff_score,
93 arg_textend,
94 arg_max_score,
95 arg_set_max_score,
96 arg_min_score,
97 arg_set_min_score,
98 arg_use_ba ) ),
99 score( NULL ), score_p3( NULL ), trigger_score( NULL ),
100 window_size( arg_window_size ), window_step( arg_window_step ),
101 unit_step( arg_unit_step ),
102 merge_pass( arg_merge_pass ),
103 merge_cutoff_score( arg_merge_cutoff_score ),
104 abs_merge_cutoff_dist( arg_abs_merge_cutoff_dist ),
105 mean_merge_cutoff_dist( arg_mean_merge_cutoff_dist ),
106 merge_unit_step( arg_merge_unit_step ),
107 trigger( arg_trigger == "mean" ? eTrigger_Mean
108 : eTrigger_Min ),
109 discontig( arg_discontig ), pattern( arg_pattern )
110 {
111 if( window_size == 0 ) window_size = ustat->UnitSize() + 4;
112
113 if( window_size < ustat->UnitSize() ) {
114 std::ostringstream os;
115 os << "window size (" << (int)window_size << ") "
116 "must be greater or equal to unit size (" <<
117 (int)ustat->UnitSize() << ")";
118 NCBI_THROW( CSeqMaskerException, eValidation, os.str() );
119 }
120
121 trigger_score = score = new CSeqMaskerScoreMean( ustat );
122
123 if( trigger == eTrigger_Min )
124 trigger_score = new CSeqMaskerScoreMin( ustat, tmin_count );
125
126 if( !score )
127 {
128 NCBI_THROW( CSeqMaskerException,
129 eScoreAllocFail,
130 "" );
131 }
132
133 if( arg_merge_pass )
134 {
135 score_p3 = new CSeqMaskerScoreMeanGlob( ustat );
136
137 if( !score )
138 {
139 NCBI_THROW( CSeqMaskerException,
140 eScoreP3AllocFail,
141 "" );
142 }
143 }
144 }
145
146 //-------------------------------------------------------------------------
~CSeqMasker()147 CSeqMasker::~CSeqMasker()
148 {
149 if( trigger_score != score ) delete trigger_score;
150
151 delete score;
152 delete score_p3;
153 }
154
155 //-------------------------------------------------------------------------
156 CSeqMasker::TMaskList *
operator ()(const CSeqVector & data) const157 CSeqMasker::operator()( const CSeqVector& data ) const
158 { return DoMask( data, 0, data.size() ); }
159
160 //-------------------------------------------------------------------------
161 CSeqMasker::TMaskList *
DoMask(const CSeqVector & data,TSeqPos begin,TSeqPos stop) const162 CSeqMasker::DoMask(
163 const CSeqVector& data, TSeqPos begin, TSeqPos stop ) const
164 {
165 ustat->total_ = 0;
166 auto_ptr<TMaskList> mask(new TMaskList);
167 Uint4 cutoff_score = ustat->get_threshold();
168 Uint4 textend = ustat->get_textend();
169 Uint1 nbits = discontig ? CSeqMaskerUtil::BitCount( pattern ) : 0;
170 Uint4 unit_size = ustat->UnitSize() + nbits;
171 auto_ptr<CSeqMaskerWindow> window_ptr
172 (discontig ? new CSeqMaskerWindowPattern( data, unit_size,
173 window_size, window_step,
174 pattern, unit_step )
175 : new CSeqMaskerWindow( data, unit_size,
176 window_size, window_step,
177 unit_step, begin, stop ));
178 CSeqMaskerWindow & window = *window_ptr;
179 score->SetWindow( window );
180
181 if( trigger == eTrigger_Min ) trigger_score->SetWindow( window );
182
183 Uint4 start = 0, end = 0, cend = 0;
184 Uint4 limit = textend;
185 const CSeqMaskerIstat::optimization_data * od
186 = ustat->get_optimization_data();
187
188 CSeqMaskerCacheBoost booster( window, od );
189
190 while( window )
191 {
192 Uint4 ts = (*trigger_score)();
193 Uint4 s = (*score)();
194 Uint4 adv = window_step;
195
196 if( s < limit )
197 {
198 if( end > start )
199 {
200 if( window.Start() > cend )
201 {
202 mask->push_back( TMaskedInterval( start, end ) );
203 start = end = cend = 0;
204 }
205 }
206
207 if( od != 0 && od->cba_ != 0 )
208 {
209 adv = window.Start();
210
211 if( !booster.Check() )
212 break;
213
214 adv = window_step*( 1 + window.Start() - adv );
215 }
216 }
217 else if( ts < cutoff_score )
218 {
219 if( end > start )
220 {
221 if( window.Start() > cend + 1 )
222 {
223 mask->push_back( TMaskedInterval( start, end ) );
224 start = end = cend = 0;
225 }
226 else cend = window.End();
227 }
228 }
229 else
230 {
231 if( end > start )
232 {
233 if( window.Start() > cend + 1 )
234 {
235 mask->push_back( TMaskedInterval( start, end ) );
236 start = window.Start();
237 }
238 }
239 else start = window.Start();
240
241 cend = end = window.End();
242 }
243
244
245 if( adv == window_step )
246 ++window;
247
248 score->PostAdvance( adv );
249 }
250
251 if( end > start )
252 mask->push_back( TMaskedInterval( start, end ) );
253
254 window_ptr.reset();
255
256 if( merge_pass )
257 {
258 Uint1 nbits = discontig ? CSeqMaskerUtil::BitCount( pattern ) : 0;
259 Uint4 unit_size = ustat->UnitSize() + nbits;
260
261 if( mask->size() < 2 ) return mask.release();
262
263 TMList masked, unmasked;
264 TMaskList::iterator jtmp = mask->end();
265
266 {{
267 for( TMaskList::iterator i = mask->begin(), j = --jtmp;
268 i != j; )
269 {
270 masked.push_back( mitem( i->first, i->second, unit_size,
271 data, *this ) );
272 Uint4 nstart = (i++)->second - unit_size + 2;
273 unmasked.push_back( mitem( nstart, i->first + unit_size - 2,
274 unit_size, data, *this ) );
275 }
276
277 masked.push_back( mitem( (mask->rbegin())->first,
278 (mask->rbegin())->second,
279 unit_size, data, *this ) );
280 }}
281
282 Int4 count = 0;
283 TMList::iterator ii = masked.begin();
284 TMList::iterator j = unmasked.begin();
285 TMList::iterator k = ii, l = ii;
286 --k; ++l;
287
288 for( ; ii != masked.end(); k = l = ii, --k, ++l )
289 {
290 Uint4 ldist = (ii != masked.begin())
291 ? ii->start - k->end - 1 : 0;
292 TMList::iterator tmpend = masked.end();
293 --tmpend;
294 Uint4 rdist = (ii != tmpend)
295 ? l->start - ii->end - 1 : 0;
296 double lavg = 0.0, ravg = 0.0;
297 bool can_go_left = count && ldist
298 && ldist <= mean_merge_cutoff_dist;
299 bool can_go_right = rdist
300 && rdist <= mean_merge_cutoff_dist;
301
302 if( can_go_left )
303 {
304 TMList::iterator tmp = j; --tmp;
305 lavg = MergeAvg( k, tmp, unit_size );
306 can_go_left = can_go_left && (lavg >= merge_cutoff_score);
307 }
308
309 if( can_go_right )
310 {
311 ravg = MergeAvg( ii, j, unit_size );
312 can_go_right = can_go_right && (ravg >= merge_cutoff_score);
313 }
314
315 if( can_go_right )
316 {
317 if( can_go_left )
318 {
319 if( ravg >= lavg )
320 {
321 ++count;
322 ++ii;
323 ++j;
324 }
325 else // count must be greater than 0.
326 {
327 --count;
328 k->avg = MergeAvg( k, --j, unit_size );
329 _TRACE( "Merging "
330 << k->start << " - " << k->end
331 << " and "
332 << ii->start << " - " << ii->end );
333 Merge( masked, k, unmasked, j );
334
335 if( count )
336 {
337 ii = --k;
338 --j;
339 --count;
340 }
341 else ii = k;
342 }
343 }
344 else
345 {
346 ++count;
347 ++ii;
348 ++j;
349 }
350 }
351 else if( can_go_left )
352 {
353 --count;
354 k->avg = MergeAvg( k, --j, unit_size );
355 _TRACE( "Merging "
356 << k->start << " - " << k->end
357 << " and "
358 << ii->start << " - " << ii->end );
359 Merge( masked, k, unmasked, j );
360
361 if( count )
362 {
363 ii = --k;
364 --j;
365 --count;
366 }
367 else ii = k;
368 }
369 else
370 {
371 ++ii;
372 ++j;
373 count = 0;
374 }
375 }
376
377 for( ii = masked.begin(), j = unmasked.begin(), k = ii++;
378 ii != masked.end(); (k = ii++), j++ )
379 {
380 if( k->end + abs_merge_cutoff_dist >= ii->start )
381 {
382 _TRACE( "Unconditionally merging "
383 << k->start << " - " << k->end
384 << " and "
385 << ii->start << " - " << ii->end );
386 k->avg = MergeAvg( k, j, unit_size );
387 Merge( masked, k, unmasked, j );
388 ii = k;
389
390 if( ++ii == masked.end() ) break;
391 }
392 }
393
394 mask->clear();
395
396 for( TMList::const_iterator iii = masked.begin(); iii != masked.end(); ++iii )
397 mask->push_back( TMaskedInterval( iii->start, iii->end ) );
398 }
399
400 return mask.release();
401 }
402
403 //-------------------------------------------------------------------------
MergeAvg(TMList::iterator mi,const TMList::iterator & umi,Uint4 unit_size) const404 double CSeqMasker::MergeAvg( TMList::iterator mi,
405 const TMList::iterator & umi,
406 Uint4 unit_size ) const
407 {
408 TMList::iterator tmp = mi++;
409 Uint4 n1 = (tmp->end - tmp->start - unit_size + 2)/merge_unit_step;
410 Uint4 n2 = (umi->end - umi->start - unit_size + 2)/merge_unit_step;
411 Uint4 n3 = (mi->end - mi->start - unit_size + 2)/merge_unit_step;
412 Uint4 N = (mi->end - tmp->start - unit_size + 2)/merge_unit_step;
413 double a1 = tmp->avg, a2 = umi->avg, a3 = mi->avg;
414 return (a1*n1 + a2*n2 + a3*n3)/N;
415 }
416
417 //-------------------------------------------------------------------------
Merge(TMList & m,TMList::iterator mi,TMList & um,TMList::iterator & umi) const418 void CSeqMasker::Merge( TMList & m, TMList::iterator mi,
419 TMList & um, TMList::iterator & umi ) const
420 {
421 TMList::iterator tmp = mi++;
422 tmp->end = mi->end;
423 m.erase( mi );
424 umi = um.erase( umi );
425 }
426
427 //----------------------------------------------------------------------------
GetErrCodeString() const428 const char * CSeqMasker::CSeqMaskerException::GetErrCodeString() const
429 {
430 switch( GetErrCode() )
431 {
432 case eLstatStreamIpenFail:
433
434 return "can not open input stream";
435
436 case eLstatSyntax:
437
438 return "syntax error";
439
440 case eLstatParam:
441
442 return "the following parameters could not be determined"
443 " from the unit frequency database or command line: ";
444
445 case eScoreAllocFail:
446
447 return "score function object allocation failed";
448
449 case eScoreP3AllocFail:
450
451 return "merge pass score function object allocation failed";
452
453 case eValidation:
454
455 return "validation error";
456
457 default:
458
459 return CException::GetErrCodeString();
460 }
461 }
462
463 //----------------------------------------------------------------------------
mitem(Uint4 arg_start,Uint4 arg_end,Uint1 unit_size,const CSeqVector & data,const CSeqMasker & owner)464 CSeqMasker::mitem::mitem( Uint4 arg_start, Uint4 arg_end, Uint1 unit_size,
465 const CSeqVector & data, const CSeqMasker & owner )
466 : start( arg_start ), end( arg_end ), avg( 0.0 )
467 {
468 const Uint1 & window_size = owner.window_size;
469 const CSeqMaskerWindow::TUnit & ambig_unit = owner.ustat->AmbigUnit();
470 CSeqMaskerScore * const score = owner.score_p3;
471 CSeqMaskerWindow * window = NULL;
472
473 if( owner.discontig )
474 window = new CSeqMaskerWindowPatternAmbig( data, unit_size, window_size,
475 owner.merge_unit_step,
476 owner.pattern, ambig_unit,
477 start,
478 owner.merge_unit_step );
479 else
480 window = new CSeqMaskerWindowAmbig( data, unit_size, window_size,
481 owner.merge_unit_step,
482 ambig_unit, start,
483 owner.merge_unit_step );
484
485 score->SetWindow( *window );
486 Uint4 step = window->Step();
487
488 while( window->End() < end )
489 {
490 score->PreAdvance( step );
491 ++*window;
492 score->PostAdvance( step );
493 }
494
495 avg = (*score)();
496 delete window;
497 }
498
499 //----------------------------------------------------------------------------
MergeMaskInfo(TMaskList * dest,const TMaskList * src)500 void CSeqMasker::MergeMaskInfo( TMaskList * dest, const TMaskList * src )
501 {
502 if( src->empty() )
503 return;
504
505 TMaskList::const_iterator si( src->begin() );
506 TMaskList::const_iterator send( src->end() );
507 TMaskList::iterator di( dest->begin() );
508 TMaskList::iterator dend( dest->end() );
509 TMaskList res;
510 TMaskedInterval seg;
511 TMaskedInterval next_seg;
512
513 if( di != dend && di->first < si->first )
514 seg = *(di++);
515 else seg = *(si++);
516
517 while( true )
518 {
519 if( si != send ) {
520 if( di != dend ) {
521 if( si->first < di->first ) {
522 next_seg = *(si++);
523 } else {
524 next_seg = *(di++);
525 }
526 } else {
527 next_seg = *(si++);
528 }
529 } else if( di != dend ) {
530 next_seg = *(di++);
531 } else {
532 break;
533 }
534
535 if( seg.second + 1 < next_seg.first ) {
536 res.push_back( seg );
537 seg = next_seg;
538 }
539 else if( seg.second < next_seg.second ) {
540 seg.second = next_seg.second;
541 }
542 }
543
544 res.push_back( seg );
545 dest->swap(res);
546 }
547
548
549 END_NCBI_SCOPE
550