1 /*  $Id: seq_masker.hpp 575325 2018-11-27 18:22:00Z ucko $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Aleksandr Morgulis
27  *
28  * File Description:
29  *   Header file for CSeqMasker class.
30  *
31  */
32 
33 #ifndef C_SEQ_MASKER_H
34 #define C_SEQ_MASKER_H
35 
36 #include <corelib/ncbitype.h>
37 #include <corelib/ncbistr.hpp>
38 #include <corelib/ncbiobj.hpp>
39 
40 #include <algo/winmask/seq_masker_window.hpp>
41 #include <algo/winmask/seq_masker_istat.hpp>
42 #include <algo/winmask/seq_masker_version.hpp>
43 
44 BEGIN_NCBI_SCOPE
45 
46 class CSeqMaskerScore;
47 
48 /**
49  **\brief Main interface to window based masker functionality.
50  **
51  **/
52 class NCBI_XALGOWINMASK_EXPORT CSeqMasker
53 {
54 public:
55 
56     /// Version of window masking algorithm.
57     static CSeqMaskerVersion AlgoVersion;
58 
59     /**
60      **\brief Type representing a masked interval within a sequence.
61      **
62      ** If A is an object of type TMaskedInterval, then A.first is
63      ** the offset (starting from 0) of the beginning of the
64      ** interval; A.second is the offset of the end of the interval.
65      **
66      **/
67     typedef pair< TSeqPos, TSeqPos > TMaskedInterval;
68 
69     /**
70      **\brief A type representing the total of masking information
71      **       about a sequence.
72      **
73      **/
74     typedef vector< TMaskedInterval > TMaskList;
75 
76     /**
77      **\brief Represents different error situations that can occur
78      **       in the masking process.
79      **/
80     class CSeqMaskerException : public CException
81     {
82     public:
83 
84         /**
85          **\brief Integer error codes.
86          **/
87         enum EErrCode
88         {
89             eLstatStreamIpenFail,   /**< Error opening the length statistics file */
90             eLstatSyntax,           /**< Error parsing the length statistics file */
91             eLstatParam,            /**< Error deducing parameters from lstat or command line */
92             eScoreAllocFail,        /**< Error allocating the score function object */
93             eScoreP3AllocFail,      /**< Error allocating the score function object for merging pass */
94             eValidation             /**< Insconsistent internal parameters */
95         };
96 
97         /**
98          **\brief Get the exception description string.
99          **
100          ** The method translates internal error code in the exception
101          ** object into a human readable explanation string.
102          **
103          **\return explanation string for the exception
104          **
105          **/
106         virtual const char * GetErrCodeString() const override;
107 
108         NCBI_EXCEPTION_DEFAULT( CSeqMaskerException, CException );
109     };
110 
111     /**
112      **\brief Merge together two result lists.
113      **
114      ** Used to merge results lists obtained from winmask and dust
115      ** algorithms.
116      **
117      **\param dest this list will contain the merged data
118      **\param src the other results list
119      **/
120     static void MergeMaskInfo( TMaskList * dest, const TMaskList * src );
121 
122     /**
123      **\brief Object constructor.
124      **
125      ** Parameters to the constructor determine the behaviour of the
126      ** window based masking procedure.
127      **
128      **\param lstat_name the name of the file containing length statistics
129      **\param arg_window_size the window size in bps
130      **\param arg_window_step the window step
131      **\param arg_unit_step the unit step
132      **\param arg_textend the score above which it is allowed to keep masking
133      **\param arg_cutoff_score the unit score triggering the masking
134      **\param arg_max_score maximum allowed unit score
135      **\param arg_min_score minimum allowed unit score
136      **\param arg_set_max_score score to use for units exceeding max_score
137      **\param arg_set_min_score score to use for units below min_score
138      **\param arg_merge_pass whether or not to perform an interval merging pass
139      **\param arg_merge_cutoff_score combined average score at which intervals
140      **                              should be merged
141      **\param arg_abs_merge_cutoff_dist maximum distance between intervals
142      **                                 at which they can be merged
143      **                                 unconditionally
144      **\param arg_mean_merge_cutoff_dist maximum distance between intervals
145      **                                  at which they can be merged if they
146      **                                  satisfy arg_merge_cutoff_score
147      **                                  threshold
148      **\param arg_merge_unit_step unit step to use for interval merging
149      **\param arg_trigger determines which method to use to trigger masking
150      **\param tmin_count if arg_trigger is "min" then determines how many of
151      **                  the units in a window should be above the score
152      **                  threshold in order to trigger masking
153      **\param arg_discontig whether or not to use discontiguous units
154      **\param arg_pattern base pattern to form discontiguous units
155      **\param arg_use_ba use bit array optimization, if available
156      **
157      **/
158     CSeqMasker( const string & lstat_name,
159                 Uint1 arg_window_size,
160                 Uint4 arg_window_step,
161                 Uint1 arg_unit_step,
162                 Uint4 arg_textend,
163                 Uint4 arg_cutoff_score,
164                 Uint4 arg_max_score,
165                 Uint4 arg_min_score,
166                 Uint4 arg_set_max_score,
167                 Uint4 arg_set_min_score,
168                 bool arg_merge_pass,
169                 Uint4 arg_merge_cutoff_score,
170                 Uint4 arg_abs_merge_cutoff_dist,
171                 Uint4 arg_mean_merge_cutoff_dist,
172                 Uint1 arg_merge_unit_step,
173                 const string & arg_trigger,
174                 Uint1 tmin_count,
175                 bool arg_discontig,
176                 Uint4 arg_pattern,
177                 bool arg_use_ba );
178 
179     /**
180      **\brief Object destructor.
181      **
182      **/
183     ~CSeqMasker();
184 
185     /**
186      **\brief Sequence masking operator.
187      **
188      ** seq_masker objects are function objects with. Main
189      ** processing is done by () operator.
190      **
191      **\param data the original sequence data in iupacna format
192      **\return pointer to the list of masked intervals
193      **
194      **/
195     TMaskList * operator()( const objects::CSeqVector & data ) const;
196 
197 private:
198 
199     /**\internal
200      **\brief Internal representation of a sequence interval.
201      **/
202     struct mitem
203     {
204         Uint4 start;    /**< Start of the interval */
205         Uint4 end;  /**< End of the interval */
206         double avg; /**< Average score of the units in the interval */
207 
208         /**
209          **\brief Object constructor.
210          **
211          ** All the additional parameters are used by the constructor to compute
212          ** the value of avg.
213          **
214          **\param start the start of the interval
215          **\param end the end of the interval
216          **\param unit_size the unit size in bases
217          **\param data the original sequence data in iupacna format
218          **\param owner back pointer to the seq_masker instance
219          **
220          **/
221         mitem( Uint4 start, Uint4 end, Uint1 unit_size,
222                const objects::CSeqVector & data, const CSeqMasker & owner );
223     };
224 
225     friend struct CSeqMasker::mitem;
226 
227     /**\internal
228      **\brief Type used for storing intermediate masked and unmasked intervals.
229      **/
230     typedef list< mitem > TMList;
231 
232     /** \internal
233         \brief Final masking pass with lookups of the actual Nmer scores.
234         \param data the sequence data
235         \param start start masking at this location
236         \param end stop masking at this location
237         \return container with masked intervals
238      */
239     TMaskList * DoMask( const objects::CSeqVector & data,
240                         TSeqPos start, TSeqPos end ) const;
241 
242     /**\internal
243      **\brief Computes the average score of an interval generated by
244      **       connecting two neighbouring masked intervals.
245      **
246      **\param mi points to the first masked interval
247      **\param umi points to the right unmasked neighbour of mi
248      **\param unit_size the unit size to use in computations
249      **\return the average score of an interval formed by
250      **        mi, umi, and mi+1
251      **
252      **/
253     double MergeAvg( TMList::iterator mi, const TMList::iterator & umi,
254                      Uint4 unit_size ) const;
255 
256     /**\internal
257      **\brief Merge two neighbouring masked intervals.
258      **
259      ** Merges intervals mi and mi+1 into one with average of the
260      ** triple mi,umi,mi+1. Removes mi mi+1 from m and substitues
261      ** mi with the merged interval. Removes umi from um.
262      **
263      **\param m list of intervals containing mi
264      **\param mi points to the first masked interval in the pair
265      **          that is being merged
266      **\param um list of intervals containing umi
267      **\param umi points to the right unmasked neighbour of mi
268      **
269      **/
270     void Merge( TMList & m, TMList::iterator mi,
271                 TMList & um, TMList::iterator & umi ) const;
272 
273     /**\internal
274      **\brief Container of the unit score statistics.
275      **/
276     CRef< CSeqMaskerIstat > ustat;
277 
278     /**\internal
279      **\brief Score function object to use for extensions.
280      **/
281     CSeqMaskerScore * score;
282 
283     /**\internal
284      **\brief Score function object to use for merging.
285      **/
286     CSeqMaskerScore * score_p3;
287 
288     /**\internal
289      **\brief Score function object to use for triggering masking.
290      **/
291     CSeqMaskerScore * trigger_score;
292 
293     /**\internal
294      **\brief The window size in bases.
295      **/
296     Uint1 window_size;
297 
298     /**\internal
299      **\brief The window step.
300      **
301      ** Only windows that start at 0 mod window_step will be considered.
302      **
303      **/
304     Uint4 window_step;
305 
306     /**\internal
307      **\brief The unit step.
308      **
309      ** The distance between consequtive units within a window.
310      **
311      **/
312     Uint1 unit_step;
313 
314     /**\internal
315      **\brief Flag indicating whether the merging pass is required.
316      **/
317     bool merge_pass;
318 
319     /**\internal
320      **\brief Average score that triggers merging of neighbouring
321      **       masked intervals.
322      **/
323     Uint4 merge_cutoff_score;
324 
325     /**\internal
326      **\brief Neighbouring masked intervals that closer to each other
327      **       than this distance are merged unconditionally.
328      **/
329     Uint4 abs_merge_cutoff_dist;
330 
331     /**\internal
332      **\brief Neighbouring masked intervals that are farther apart from
333      **       each other than this distance are never merged.
334      **/
335     Uint4 mean_merge_cutoff_dist;
336 
337     /**\internal
338      **\brief Unit step to use for interval merging.
339      **
340      ** This is the unit step value that should be used when
341      ** computing the unit score average over the total span of
342      ** two intervals that are candidates for merging.
343      **
344      **/
345     Uint1 merge_unit_step;
346 
347     /**\internal
348      **\brief Symbolic names for different masking triggering methods.
349      **/
350     enum
351     {
352         eTrigger_Mean = 0,  /**< Using mean of unit scores in the window. */
353         eTrigger_Min        /**< Using min score of k unit in the window. */
354     } trigger;
355 
356     /**\internal
357      **\brief Flag indicating the use of discontiguous units.
358      **/
359     bool discontig;
360 
361     /**\internal
362      **\brief Base pattern to form discontiguous units.
363      **/
364     Uint4 pattern;
365 };
366 
367 END_NCBI_SCOPE
368 
369 #endif
370