1 /* $Id: symdust.hpp 486434 2015-12-04 14:20:02Z fongah2 $ 2 * =========================================================================== 3 * 4 * PUBLIC DOMAIN NOTICE 5 * National Center for Biotechnology Information 6 * 7 * This software/database is a "United States Government Work" under the 8 * terms of the United States Copyright Act. It was written as part of 9 * the author's official duties as a United States Government employee and 10 * thus cannot be copyrighted. This software/database is freely available 11 * to the public for use. The National Library of Medicine and the U.S. 12 * Government have not placed any restriction on its use or reproduction. 13 * 14 * Although all reasonable efforts have been taken to ensure the accuracy 15 * and reliability of the software and data, the NLM and the U.S. 16 * Government do not and cannot warrant the performance or results that 17 * may be obtained by using this software or data. The NLM and the U.S. 18 * Government disclaim all warranties, express or implied, including 19 * warranties of performance, merchantability or fitness for any particular 20 * purpose. 21 * 22 * Please cite the author in any work or product based on this material. 23 * 24 * =========================================================================== 25 * 26 * Author: Aleksandr Morgulis 27 * 28 * File Description: 29 * Header file for CSymDustMasker class. 30 * 31 */ 32 33 #ifndef C_SYM_DUST_MASKER_HPP 34 #define C_SYM_DUST_MASKER_HPP 35 36 #include <corelib/ncbitype.h> 37 #include <corelib/ncbistr.hpp> 38 #include <corelib/ncbiobj.hpp> 39 40 #include <objects/seqloc/Seq_loc.hpp> 41 #include <objects/seqloc/Packed_seqint.hpp> 42 #include <objmgr/seq_vector.hpp> 43 #include <util/random_gen.hpp> 44 45 #include <iostream> 46 #include <vector> 47 #include <algorithm> 48 #include <iterator> 49 #include <memory> 50 #include <deque> 51 #include <list> 52 53 54 BEGIN_NCBI_SCOPE 55 56 /** 57 \brief Looks for low complexity parts of sequences according to the 58 symmetric version of DUST algorithm. 59 */ 60 class NCBI_XALGODUSTMASK_EXPORT CSymDustMasker 61 { 62 private: 63 64 /** \internal 65 \brief Function class responsible for conversion from IUPACNA 66 to NCBI2NA coding. 67 */ 68 struct CIupac2Ncbi2na_converter 69 { 70 /** \internal 71 \brief Operator performing the actual conversion. 72 \param r base letter in IOPACNA encoding 73 \return the same letter in NCBI2NA encoding 74 */ operator ()CSymDustMasker::CIupac2Ncbi2na_converter75 Uint1 operator()( Uint1 r ) 76 { 77 switch( r ) 78 { 79 case 67: return 1; 80 case 71: return 2; 81 case 84: return 3; 82 case 78: return (m_Random.GetRand() & 0x3); 83 default: return 0; 84 } 85 } 86 CRandom m_Random; 87 }; 88 89 typedef objects::CSeqVector seq_t; /**<\internal Sequence type. */ 90 typedef CIupac2Ncbi2na_converter convert_t; /**<\internal Converter type. */ 91 92 public: 93 94 /**\brief Public sequence type. */ 95 typedef seq_t sequence_type; 96 /**\brief Integer size type corresponding to sequence_type. */ 97 typedef sequence_type::size_type size_type; 98 /**\brief Type respresenting an interval selected for masking. */ 99 typedef std::pair< size_type, size_type > TMaskedInterval; 100 /**\brief Type representing a list of masked intervals. */ 101 typedef std::vector< TMaskedInterval > TMaskList; 102 103 static const Uint4 DEFAULT_LEVEL = 20; /**< Default value of score threshold. */ 104 static const Uint4 DEFAULT_WINDOW = 64; /**< Default window size. */ 105 static const Uint4 DEFAULT_LINKER = 1; /**< Default value of the longest distance between 106 consequtive masked intervals at which they 107 should be merged. */ 108 109 // These (up to constructor) are public to work around the bug in SUN C++ compiler. 110 111 /** \internal 112 \brief Type representing a perfect interval. 113 */ 114 struct perfect 115 { 116 TMaskedInterval bounds_; /**<\internal The actual interval. */ 117 Uint4 score_; /**<\internal The score of the interval. */ 118 size_type len_; /**<\internal The length of the interval. */ 119 120 /** \internal 121 \brief Object constructor. 122 \param start position of the left end 123 \param stop position of the right end 124 \param score the score 125 \param len the length 126 */ perfectCSymDustMasker::perfect127 perfect( size_type start, size_type stop, Uint4 score, size_type len ) 128 : bounds_( start, stop ), score_( score ), len_( len ) 129 {} 130 }; 131 132 /**\brief Type representing a list of perfect intervals. */ 133 typedef std::list< perfect > perfect_list_type; 134 /**\brief Table type to store score sum thresholds for each window length. */ 135 typedef std::vector< Uint4 > thres_table_type; 136 /**\brief Type representing a triplet value. */ 137 typedef Uint1 triplet_type; 138 139 /**\brief Selects the significant bits in triplet_type. */ 140 static const triplet_type TRIPLET_MASK = 0x3F; 141 142 /** 143 \brief Object constructor. 144 \param level score threshold 145 \param window max window size 146 \param linker max distance at which to merge consequtive masked intervals 147 */ 148 CSymDustMasker( Uint4 level = DEFAULT_LEVEL, 149 size_type window 150 = static_cast< size_type >( DEFAULT_WINDOW ), 151 size_type linker 152 = static_cast< size_type >( DEFAULT_LINKER ) ); 153 154 /** 155 \brief Mask a sequence. 156 \param seq a sequence to mask 157 \return list of masked intervals 158 */ 159 std::auto_ptr< TMaskList > operator()( const sequence_type & seq ); 160 161 /** 162 \brief Mask a part of the sequence. 163 \param seq the sequence to mask 164 \param start beginning position of the subsequence to mask 165 \param stop ending position of the subsequence to mask 166 \return list of masked intervals 167 */ 168 std::auto_ptr< TMaskList > operator()( const sequence_type & seq, 169 size_type start, size_type stop ); 170 171 /** 172 \brief Mask a sequence and return result as a sequence of CSeq_loc 173 objects. 174 \param seq_id sequence id 175 \param seq the sequence 176 \param [out] vector of const (smart) references to CSeq_loc 177 */ 178 void GetMaskedLocs( 179 objects::CSeq_id & seq_id, 180 const sequence_type & seq, 181 std::vector< CConstRef< objects::CSeq_loc > > & locs ); 182 183 /**\brief Mask a sequence and return result as a CPacked_seqint 184 instance. 185 \param seq_id sequence id 186 \param seq the sequence 187 */ 188 CRef< objects::CPacked_seqint > GetMaskedInts( 189 objects::CSeq_id & seq_id, const sequence_type & seq ); 190 191 private: 192 193 /**\internal Sequence iterator type. */ 194 typedef sequence_type::const_iterator seq_citer_type; 195 196 /** \internal 197 \brief Class representing the set of triplets in a window. 198 */ 199 class triplets 200 { 201 public: 202 203 /** \internal 204 \brief Object constructor. 205 \param window max window size 206 \param low_k max triplet multiplicity that guarantees that 207 the window score is not above the threshold 208 \param perfect_list [in/out] current list of perfect intervals 209 \param thresholds table of threshold values for each window size 210 */ 211 triplets( size_type window, 212 Uint1 low_k, 213 perfect_list_type & perfect_list, 214 thres_table_type & thresholds ); 215 start() const216 size_type start() const { return start_; } /**<\internal Get position of the first triplet. */ stop() const217 size_type stop() const { return stop_; } /**<\internal Get position of the last triplet. */ size() const218 size_type size() const { return triplet_list_.size(); } /**<\internal Get the number of triplets. */ 219 220 /** \internal 221 \brief Update the list of perfect intervals with with suffixes 222 of the current window. 223 */ 224 void find_perfect(); 225 226 /** \internal 227 \brief Shift the window one base to the right using triplet 228 value t. 229 \param t the triplet value to add to the right end of the 230 triplet list 231 \return false, if the new window contains a single triplet value; 232 true otherwise 233 */ 234 bool shift_window( triplet_type t ); 235 236 /** \internal 237 \brief Shift a single triplet window using a triplet value t. 238 \param t the triplet value to add to the right end of the 239 triplet list 240 \return false, if the new window contains a single triplet value; 241 true otherwise 242 */ 243 bool shift_high( triplet_type t ); 244 245 /** \internal 246 \brief Check the condition of Proposition 2 allowing 247 to skip the window processing. 248 \return true if the window requires suffix processing, 249 false otherwise 250 */ needs_processing() const251 bool needs_processing() const 252 { 253 Uint4 count = stop_ - L; 254 return count < triplet_list_.size() && 255 10*r_w > thresholds_[count]; 256 } 257 258 private: 259 260 /**\internal Implementation type for triplets list. */ 261 typedef std::deque< triplet_type > impl_type; 262 /**\internal Triplets list iterator type. */ 263 typedef impl_type::const_iterator impl_citer_type; 264 /**\internal Type for triplet counts tables. */ 265 typedef Uint1 counts_type[64]; 266 267 /** \internal 268 \brief Recompute the value of the running sum 269 and the triplet counts when a new triplet 270 is added. 271 \param r the running sum 272 \param c the triplet counts 273 \param t the new triplet value 274 */ add_triplet_info(Uint4 & r,counts_type & c,triplet_type t)275 void add_triplet_info( 276 Uint4 & r, counts_type & c, triplet_type t ) 277 { r += c[t]; ++c[t]; } 278 279 /** \internal 280 \brief Recompute the value of the running sum 281 and the triplet counts when a triplet 282 is removed. 283 \param r the running sum 284 \param c the triplet counts 285 \param t the triplet value being removed 286 */ rem_triplet_info(Uint4 & r,counts_type & c,triplet_type t)287 void rem_triplet_info( 288 Uint4 & r, counts_type & c, triplet_type t ) 289 { --c[t]; r -= c[t]; } 290 291 impl_type triplet_list_; /**<\internal The triplet list. */ 292 293 size_type start_; /**<\internal Position of the first triplet in the window. */ 294 size_type stop_; /**<\internal Position of the last triplet in the window. */ 295 size_type max_size_; /**<\internal Maximum window size. */ 296 297 Uint1 low_k_; /**<\internal Max triplet multiplicity that guarantees that 298 that the window score is not above the threshold. */ 299 Uint4 L; /**<\internal Position of the start of the window suffix 300 corresponding to low_k_. */ 301 302 perfect_list_type & P; /**<\internal Current list of perfect subintervals. */ 303 thres_table_type & thresholds_; /**<\internal Table containing thresholds for each 304 value of window length. */ 305 306 counts_type c_w; /**<\internal Table of triplet counts for the whole window. */ 307 counts_type c_v; /**<\internal Table of triplet counts for the window suffix. */ 308 Uint4 r_w; /**<\internal Running sum for the whole window. */ 309 Uint4 r_v; /**<\internal Running sum for the window suffix. */ 310 Uint4 num_diff; /**<\internal Number of different triplets values. */ 311 }; 312 313 /** \internal 314 \brief Merge perfect intervals into the result list. 315 \param res the result list 316 \param w the list of perfect intervals 317 \param start the start position of the subsequence 318 */ 319 void save_masked_regions( 320 TMaskList & res, size_type w, size_type start ); 321 322 Uint4 level_; /**<\internal Score threshold. */ 323 size_type window_; /**<\internal Max window size. */ 324 size_type linker_; /**<\internal Max distance at which consequtive masked intervals should be merged. */ 325 326 Uint1 low_k_; /**<\internal max triplet multiplicity guaranteeing not to exceed score threshold. */ 327 328 perfect_list_type P; /**<\internal List of perfect intervals within current window. */ 329 thres_table_type thresholds_; /**<\internal Table containing score thresholds for each window size. */ 330 331 convert_t converter_; /**\internal IUPACNA to NCBI2NA converter object. */ 332 }; 333 334 END_NCBI_SCOPE 335 336 #endif 337