1 /* $Id: seq_masker.hpp 575325 2018-11-27 18:22:00Z ucko $ 2 * =========================================================================== 3 * 4 * PUBLIC DOMAIN NOTICE 5 * National Center for Biotechnology Information 6 * 7 * This software/database is a "United States Government Work" under the 8 * terms of the United States Copyright Act. It was written as part of 9 * the author's official duties as a United States Government employee and 10 * thus cannot be copyrighted. This software/database is freely available 11 * to the public for use. The National Library of Medicine and the U.S. 12 * Government have not placed any restriction on its use or reproduction. 13 * 14 * Although all reasonable efforts have been taken to ensure the accuracy 15 * and reliability of the software and data, the NLM and the U.S. 16 * Government do not and cannot warrant the performance or results that 17 * may be obtained by using this software or data. The NLM and the U.S. 18 * Government disclaim all warranties, express or implied, including 19 * warranties of performance, merchantability or fitness for any particular 20 * purpose. 21 * 22 * Please cite the author in any work or product based on this material. 23 * 24 * =========================================================================== 25 * 26 * Author: Aleksandr Morgulis 27 * 28 * File Description: 29 * Header file for CSeqMasker class. 30 * 31 */ 32 33 #ifndef C_SEQ_MASKER_H 34 #define C_SEQ_MASKER_H 35 36 #include <corelib/ncbitype.h> 37 #include <corelib/ncbistr.hpp> 38 #include <corelib/ncbiobj.hpp> 39 40 #include <algo/winmask/seq_masker_window.hpp> 41 #include <algo/winmask/seq_masker_istat.hpp> 42 #include <algo/winmask/seq_masker_version.hpp> 43 44 BEGIN_NCBI_SCOPE 45 46 class CSeqMaskerScore; 47 48 /** 49 **\brief Main interface to window based masker functionality. 50 ** 51 **/ 52 class NCBI_XALGOWINMASK_EXPORT CSeqMasker 53 { 54 public: 55 56 /// Version of window masking algorithm. 57 static CSeqMaskerVersion AlgoVersion; 58 59 /** 60 **\brief Type representing a masked interval within a sequence. 61 ** 62 ** If A is an object of type TMaskedInterval, then A.first is 63 ** the offset (starting from 0) of the beginning of the 64 ** interval; A.second is the offset of the end of the interval. 65 ** 66 **/ 67 typedef pair< TSeqPos, TSeqPos > TMaskedInterval; 68 69 /** 70 **\brief A type representing the total of masking information 71 ** about a sequence. 72 ** 73 **/ 74 typedef vector< TMaskedInterval > TMaskList; 75 76 /** 77 **\brief Represents different error situations that can occur 78 ** in the masking process. 79 **/ 80 class CSeqMaskerException : public CException 81 { 82 public: 83 84 /** 85 **\brief Integer error codes. 86 **/ 87 enum EErrCode 88 { 89 eLstatStreamIpenFail, /**< Error opening the length statistics file */ 90 eLstatSyntax, /**< Error parsing the length statistics file */ 91 eLstatParam, /**< Error deducing parameters from lstat or command line */ 92 eScoreAllocFail, /**< Error allocating the score function object */ 93 eScoreP3AllocFail, /**< Error allocating the score function object for merging pass */ 94 eValidation /**< Insconsistent internal parameters */ 95 }; 96 97 /** 98 **\brief Get the exception description string. 99 ** 100 ** The method translates internal error code in the exception 101 ** object into a human readable explanation string. 102 ** 103 **\return explanation string for the exception 104 ** 105 **/ 106 virtual const char * GetErrCodeString() const override; 107 108 NCBI_EXCEPTION_DEFAULT( CSeqMaskerException, CException ); 109 }; 110 111 /** 112 **\brief Merge together two result lists. 113 ** 114 ** Used to merge results lists obtained from winmask and dust 115 ** algorithms. 116 ** 117 **\param dest this list will contain the merged data 118 **\param src the other results list 119 **/ 120 static void MergeMaskInfo( TMaskList * dest, const TMaskList * src ); 121 122 /** 123 **\brief Object constructor. 124 ** 125 ** Parameters to the constructor determine the behaviour of the 126 ** window based masking procedure. 127 ** 128 **\param lstat_name the name of the file containing length statistics 129 **\param arg_window_size the window size in bps 130 **\param arg_window_step the window step 131 **\param arg_unit_step the unit step 132 **\param arg_textend the score above which it is allowed to keep masking 133 **\param arg_cutoff_score the unit score triggering the masking 134 **\param arg_max_score maximum allowed unit score 135 **\param arg_min_score minimum allowed unit score 136 **\param arg_set_max_score score to use for units exceeding max_score 137 **\param arg_set_min_score score to use for units below min_score 138 **\param arg_merge_pass whether or not to perform an interval merging pass 139 **\param arg_merge_cutoff_score combined average score at which intervals 140 ** should be merged 141 **\param arg_abs_merge_cutoff_dist maximum distance between intervals 142 ** at which they can be merged 143 ** unconditionally 144 **\param arg_mean_merge_cutoff_dist maximum distance between intervals 145 ** at which they can be merged if they 146 ** satisfy arg_merge_cutoff_score 147 ** threshold 148 **\param arg_merge_unit_step unit step to use for interval merging 149 **\param arg_trigger determines which method to use to trigger masking 150 **\param tmin_count if arg_trigger is "min" then determines how many of 151 ** the units in a window should be above the score 152 ** threshold in order to trigger masking 153 **\param arg_discontig whether or not to use discontiguous units 154 **\param arg_pattern base pattern to form discontiguous units 155 **\param arg_use_ba use bit array optimization, if available 156 ** 157 **/ 158 CSeqMasker( const string & lstat_name, 159 Uint1 arg_window_size, 160 Uint4 arg_window_step, 161 Uint1 arg_unit_step, 162 Uint4 arg_textend, 163 Uint4 arg_cutoff_score, 164 Uint4 arg_max_score, 165 Uint4 arg_min_score, 166 Uint4 arg_set_max_score, 167 Uint4 arg_set_min_score, 168 bool arg_merge_pass, 169 Uint4 arg_merge_cutoff_score, 170 Uint4 arg_abs_merge_cutoff_dist, 171 Uint4 arg_mean_merge_cutoff_dist, 172 Uint1 arg_merge_unit_step, 173 const string & arg_trigger, 174 Uint1 tmin_count, 175 bool arg_discontig, 176 Uint4 arg_pattern, 177 bool arg_use_ba ); 178 179 /** 180 **\brief Object destructor. 181 ** 182 **/ 183 ~CSeqMasker(); 184 185 /** 186 **\brief Sequence masking operator. 187 ** 188 ** seq_masker objects are function objects with. Main 189 ** processing is done by () operator. 190 ** 191 **\param data the original sequence data in iupacna format 192 **\return pointer to the list of masked intervals 193 ** 194 **/ 195 TMaskList * operator()( const objects::CSeqVector & data ) const; 196 197 private: 198 199 /**\internal 200 **\brief Internal representation of a sequence interval. 201 **/ 202 struct mitem 203 { 204 Uint4 start; /**< Start of the interval */ 205 Uint4 end; /**< End of the interval */ 206 double avg; /**< Average score of the units in the interval */ 207 208 /** 209 **\brief Object constructor. 210 ** 211 ** All the additional parameters are used by the constructor to compute 212 ** the value of avg. 213 ** 214 **\param start the start of the interval 215 **\param end the end of the interval 216 **\param unit_size the unit size in bases 217 **\param data the original sequence data in iupacna format 218 **\param owner back pointer to the seq_masker instance 219 ** 220 **/ 221 mitem( Uint4 start, Uint4 end, Uint1 unit_size, 222 const objects::CSeqVector & data, const CSeqMasker & owner ); 223 }; 224 225 friend struct CSeqMasker::mitem; 226 227 /**\internal 228 **\brief Type used for storing intermediate masked and unmasked intervals. 229 **/ 230 typedef list< mitem > TMList; 231 232 /** \internal 233 \brief Final masking pass with lookups of the actual Nmer scores. 234 \param data the sequence data 235 \param start start masking at this location 236 \param end stop masking at this location 237 \return container with masked intervals 238 */ 239 TMaskList * DoMask( const objects::CSeqVector & data, 240 TSeqPos start, TSeqPos end ) const; 241 242 /**\internal 243 **\brief Computes the average score of an interval generated by 244 ** connecting two neighbouring masked intervals. 245 ** 246 **\param mi points to the first masked interval 247 **\param umi points to the right unmasked neighbour of mi 248 **\param unit_size the unit size to use in computations 249 **\return the average score of an interval formed by 250 ** mi, umi, and mi+1 251 ** 252 **/ 253 double MergeAvg( TMList::iterator mi, const TMList::iterator & umi, 254 Uint4 unit_size ) const; 255 256 /**\internal 257 **\brief Merge two neighbouring masked intervals. 258 ** 259 ** Merges intervals mi and mi+1 into one with average of the 260 ** triple mi,umi,mi+1. Removes mi mi+1 from m and substitues 261 ** mi with the merged interval. Removes umi from um. 262 ** 263 **\param m list of intervals containing mi 264 **\param mi points to the first masked interval in the pair 265 ** that is being merged 266 **\param um list of intervals containing umi 267 **\param umi points to the right unmasked neighbour of mi 268 ** 269 **/ 270 void Merge( TMList & m, TMList::iterator mi, 271 TMList & um, TMList::iterator & umi ) const; 272 273 /**\internal 274 **\brief Container of the unit score statistics. 275 **/ 276 CRef< CSeqMaskerIstat > ustat; 277 278 /**\internal 279 **\brief Score function object to use for extensions. 280 **/ 281 CSeqMaskerScore * score; 282 283 /**\internal 284 **\brief Score function object to use for merging. 285 **/ 286 CSeqMaskerScore * score_p3; 287 288 /**\internal 289 **\brief Score function object to use for triggering masking. 290 **/ 291 CSeqMaskerScore * trigger_score; 292 293 /**\internal 294 **\brief The window size in bases. 295 **/ 296 Uint1 window_size; 297 298 /**\internal 299 **\brief The window step. 300 ** 301 ** Only windows that start at 0 mod window_step will be considered. 302 ** 303 **/ 304 Uint4 window_step; 305 306 /**\internal 307 **\brief The unit step. 308 ** 309 ** The distance between consequtive units within a window. 310 ** 311 **/ 312 Uint1 unit_step; 313 314 /**\internal 315 **\brief Flag indicating whether the merging pass is required. 316 **/ 317 bool merge_pass; 318 319 /**\internal 320 **\brief Average score that triggers merging of neighbouring 321 ** masked intervals. 322 **/ 323 Uint4 merge_cutoff_score; 324 325 /**\internal 326 **\brief Neighbouring masked intervals that closer to each other 327 ** than this distance are merged unconditionally. 328 **/ 329 Uint4 abs_merge_cutoff_dist; 330 331 /**\internal 332 **\brief Neighbouring masked intervals that are farther apart from 333 ** each other than this distance are never merged. 334 **/ 335 Uint4 mean_merge_cutoff_dist; 336 337 /**\internal 338 **\brief Unit step to use for interval merging. 339 ** 340 ** This is the unit step value that should be used when 341 ** computing the unit score average over the total span of 342 ** two intervals that are candidates for merging. 343 ** 344 **/ 345 Uint1 merge_unit_step; 346 347 /**\internal 348 **\brief Symbolic names for different masking triggering methods. 349 **/ 350 enum 351 { 352 eTrigger_Mean = 0, /**< Using mean of unit scores in the window. */ 353 eTrigger_Min /**< Using min score of k unit in the window. */ 354 } trigger; 355 356 /**\internal 357 **\brief Flag indicating the use of discontiguous units. 358 **/ 359 bool discontig; 360 361 /**\internal 362 **\brief Base pattern to form discontiguous units. 363 **/ 364 Uint4 pattern; 365 }; 366 367 END_NCBI_SCOPE 368 369 #endif 370