1 /*  $Id: symdust.hpp 486434 2015-12-04 14:20:02Z fongah2 $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Aleksandr Morgulis
27  *
28  * File Description:
29  *   Header file for CSymDustMasker class.
30  *
31  */
32 
33 #ifndef C_SYM_DUST_MASKER_HPP
34 #define C_SYM_DUST_MASKER_HPP
35 
36 #include <corelib/ncbitype.h>
37 #include <corelib/ncbistr.hpp>
38 #include <corelib/ncbiobj.hpp>
39 
40 #include <objects/seqloc/Seq_loc.hpp>
41 #include <objects/seqloc/Packed_seqint.hpp>
42 #include <objmgr/seq_vector.hpp>
43 #include <util/random_gen.hpp>
44 
45 #include <iostream>
46 #include <vector>
47 #include <algorithm>
48 #include <iterator>
49 #include <memory>
50 #include <deque>
51 #include <list>
52 
53 
54 BEGIN_NCBI_SCOPE
55 
56 /**
57     \brief Looks for low complexity parts of sequences according to the
58            symmetric version of DUST algorithm.
59  */
60 class NCBI_XALGODUSTMASK_EXPORT CSymDustMasker
61 {
62     private:
63 
64         /** \internal
65             \brief Function class responsible for conversion from IUPACNA
66                    to NCBI2NA coding.
67          */
68         struct CIupac2Ncbi2na_converter
69         {
70             /** \internal
71                 \brief Operator performing the actual conversion.
72                 \param r base letter in IOPACNA encoding
73                 \return the same letter in NCBI2NA encoding
74              */
operator ()CSymDustMasker::CIupac2Ncbi2na_converter75             Uint1 operator()( Uint1 r )
76             {
77                 switch( r )
78                 {
79                     case 67: return 1;
80                     case 71: return 2;
81                     case 84: return 3;
82                     case 78: return (m_Random.GetRand() & 0x3);
83                     default: return 0;
84                 }
85             }
86             CRandom m_Random;
87         };
88 
89         typedef objects::CSeqVector seq_t;          /**<\internal Sequence type. */
90         typedef CIupac2Ncbi2na_converter convert_t; /**<\internal Converter type. */
91 
92     public:
93 
94         /**\brief Public sequence type. */
95         typedef seq_t sequence_type;
96         /**\brief Integer size type corresponding to sequence_type. */
97         typedef sequence_type::size_type size_type;
98         /**\brief Type respresenting an interval selected for masking. */
99         typedef std::pair< size_type, size_type > TMaskedInterval;
100         /**\brief Type representing a list of masked intervals. */
101         typedef std::vector< TMaskedInterval > TMaskList;
102 
103         static const Uint4 DEFAULT_LEVEL  = 20; /**< Default value of score threshold. */
104         static const Uint4 DEFAULT_WINDOW = 64; /**< Default window size. */
105         static const Uint4 DEFAULT_LINKER = 1;  /**< Default value of the longest distance between
106                                                      consequtive masked intervals at which they
107                                                      should be merged. */
108 
109         // These (up to constructor) are public to work around the bug in SUN C++ compiler.
110 
111         /** \internal
112             \brief Type representing a perfect interval.
113          */
114         struct perfect
115         {
116             TMaskedInterval bounds_;    /**<\internal The actual interval. */
117             Uint4 score_;               /**<\internal The score of the interval. */
118             size_type len_;             /**<\internal The length of the interval. */
119 
120             /** \internal
121                 \brief Object constructor.
122                 \param start position of the left end
123                 \param stop position of the right end
124                 \param score the score
125                 \param len the length
126              */
perfectCSymDustMasker::perfect127             perfect( size_type start, size_type stop, Uint4 score, size_type len )
128                 : bounds_( start, stop ), score_( score ), len_( len )
129             {}
130         };
131 
132         /**\brief Type representing a list of perfect intervals. */
133         typedef std::list< perfect > perfect_list_type;
134         /**\brief Table type to store score sum thresholds for each window length. */
135         typedef std::vector< Uint4 > thres_table_type;
136         /**\brief Type representing a triplet value. */
137         typedef Uint1 triplet_type;
138 
139         /**\brief Selects the significant bits in triplet_type. */
140         static const triplet_type TRIPLET_MASK = 0x3F;
141 
142         /**
143             \brief Object constructor.
144             \param level score threshold
145             \param window max window size
146             \param linker max distance at which to merge consequtive masked intervals
147          */
148         CSymDustMasker( Uint4 level = DEFAULT_LEVEL,
149                         size_type window
150                             = static_cast< size_type >( DEFAULT_WINDOW ),
151                         size_type linker
152                             = static_cast< size_type >( DEFAULT_LINKER ) );
153 
154         /**
155             \brief Mask a sequence.
156             \param seq a sequence to mask
157             \return list of masked intervals
158          */
159         std::auto_ptr< TMaskList > operator()( const sequence_type & seq );
160 
161         /**
162             \brief Mask a part of the sequence.
163             \param seq the sequence to mask
164             \param start beginning position of the subsequence to mask
165             \param stop ending position of the subsequence to mask
166             \return list of masked intervals
167          */
168         std::auto_ptr< TMaskList > operator()( const sequence_type & seq,
169                                                size_type start, size_type stop );
170 
171         /**
172             \brief Mask a sequence and return result as a sequence of CSeq_loc
173                    objects.
174             \param seq_id sequence id
175             \param seq the sequence
176             \param [out] vector of const (smart) references to CSeq_loc
177          */
178         void GetMaskedLocs(
179             objects::CSeq_id & seq_id,
180             const sequence_type & seq,
181             std::vector< CConstRef< objects::CSeq_loc > > & locs );
182 
183         /**\brief Mask a sequence and return result as a CPacked_seqint
184                   instance.
185            \param seq_id sequence id
186            \param seq the sequence
187           */
188         CRef< objects::CPacked_seqint > GetMaskedInts(
189             objects::CSeq_id & seq_id, const sequence_type & seq );
190 
191     private:
192 
193         /**\internal Sequence iterator type. */
194         typedef sequence_type::const_iterator seq_citer_type;
195 
196         /** \internal
197             \brief Class representing the set of triplets in a window.
198          */
199         class triplets
200         {
201             public:
202 
203                 /** \internal
204                     \brief Object constructor.
205                     \param window max window size
206                     \param low_k max triplet multiplicity that guarantees that
207                                  the window score is not above the threshold
208                     \param perfect_list [in/out] current list of perfect intervals
209                     \param thresholds table of threshold values for each window size
210                  */
211                 triplets( size_type window,
212                           Uint1 low_k,
213                           perfect_list_type & perfect_list,
214                           thres_table_type & thresholds );
215 
start() const216                 size_type start() const { return start_; }  /**<\internal Get position of the first triplet. */
stop() const217                 size_type stop() const { return stop_; }    /**<\internal Get position of the last triplet. */
size() const218                 size_type size() const { return triplet_list_.size(); } /**<\internal Get the number of triplets. */
219 
220                 /** \internal
221                     \brief Update the list of perfect intervals with with suffixes
222                            of the current window.
223                  */
224                 void find_perfect();
225 
226                 /** \internal
227                     \brief Shift the window one base to the right using triplet
228                            value t.
229                     \param t the triplet value to add to the right end of the
230                              triplet list
231                     \return false, if the new window contains a single triplet value;
232                             true otherwise
233                 */
234                 bool shift_window( triplet_type t );
235 
236                 /** \internal
237                     \brief Shift a single triplet window using a triplet value t.
238                     \param t the triplet value to add to the right end of the
239                              triplet list
240                     \return false, if the new window contains a single triplet value;
241                             true otherwise
242                 */
243                 bool shift_high( triplet_type t );
244 
245                 /** \internal
246                     \brief Check the condition of Proposition 2 allowing
247                            to skip the window processing.
248                     \return true if the window requires suffix processing,
249                             false otherwise
250                 */
needs_processing() const251                 bool needs_processing() const
252                 {
253                   Uint4 count = stop_ - L;
254                   return count < triplet_list_.size() &&
255                          10*r_w > thresholds_[count];
256                 }
257 
258             private:
259 
260                 /**\internal Implementation type for triplets list. */
261                 typedef std::deque< triplet_type > impl_type;
262                 /**\internal Triplets list iterator type. */
263                 typedef impl_type::const_iterator impl_citer_type;
264                 /**\internal Type for triplet counts tables. */
265                 typedef Uint1 counts_type[64];
266 
267                 /** \internal
268                     \brief Recompute the value of the running sum
269                            and the triplet counts when a new triplet
270                            is added.
271                     \param r the running sum
272                     \param c the triplet counts
273                     \param t the new triplet value
274                 */
add_triplet_info(Uint4 & r,counts_type & c,triplet_type t)275                 void add_triplet_info(
276                         Uint4 & r, counts_type & c, triplet_type t )
277                 { r += c[t]; ++c[t]; }
278 
279                 /** \internal
280                     \brief Recompute the value of the running sum
281                            and the triplet counts when a triplet
282                            is removed.
283                     \param r the running sum
284                     \param c the triplet counts
285                     \param t the triplet value being removed
286                 */
rem_triplet_info(Uint4 & r,counts_type & c,triplet_type t)287                 void rem_triplet_info(
288                         Uint4 & r, counts_type & c, triplet_type t )
289                 { --c[t]; r -= c[t]; }
290 
291                 impl_type triplet_list_;            /**<\internal The triplet list. */
292 
293                 size_type start_;                   /**<\internal Position of the first triplet in the window. */
294                 size_type stop_;                    /**<\internal Position of the last triplet in the window. */
295                 size_type max_size_;                /**<\internal Maximum window size. */
296 
297                 Uint1 low_k_;                       /**<\internal Max triplet multiplicity that guarantees that
298                                                                   that the window score is not above the threshold. */
299                 Uint4 L;                            /**<\internal Position of the start of the window suffix
300                                                                   corresponding to low_k_. */
301 
302                 perfect_list_type & P;              /**<\internal Current list of perfect subintervals. */
303                 thres_table_type & thresholds_;     /**<\internal Table containing thresholds for each
304                                                                   value of window length. */
305 
306                 counts_type c_w;             /**<\internal Table of triplet counts for the whole window. */
307                 counts_type c_v;             /**<\internal Table of triplet counts for the window suffix. */
308                 Uint4 r_w;                   /**<\internal Running sum for the whole window. */
309                 Uint4 r_v;                   /**<\internal Running sum for the window suffix. */
310                 Uint4 num_diff;              /**<\internal Number of different triplets values. */
311         };
312 
313         /** \internal
314             \brief Merge perfect intervals into the result list.
315             \param res the result list
316             \param w the list of perfect intervals
317             \param start the start position of the subsequence
318         */
319         void save_masked_regions(
320                 TMaskList & res, size_type w, size_type start );
321 
322         Uint4 level_;       /**<\internal Score threshold. */
323         size_type window_;  /**<\internal Max window size. */
324         size_type linker_;  /**<\internal Max distance at which consequtive masked intervals should be merged. */
325 
326         Uint1 low_k_;   /**<\internal max triplet multiplicity guaranteeing not to exceed score threshold. */
327 
328         perfect_list_type P;            /**<\internal List of perfect intervals within current window. */
329         thres_table_type thresholds_;   /**<\internal Table containing score thresholds for each window size. */
330 
331         convert_t converter_;   /**\internal IUPACNA to NCBI2NA converter object. */
332 };
333 
334 END_NCBI_SCOPE
335 
336 #endif
337