1 /*  $Id: seq_match.hpp 103491 2007-05-04 17:18:18Z kazimird $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors:  Josh Cherry
27  *
28  * File Description:  Simple pattern matching for sequences
29  *
30  */
31 
32 
33 #ifndef GUI_CORE_ALGO_BASIC___SEQ_MATCH__HPP
34 #define GUI_CORE_ALGO_BASIC___SEQ_MATCH__HPP
35 
36 #include <corelib/ncbistd.hpp>
37 
38 BEGIN_NCBI_SCOPE
39 
40 ///
41 /// This class provides functions for determining where
42 /// sequences, perhaps containing ambiguity codes,
43 /// must or can match patterns.
44 ///
45 /// The functions are highly templatized.  The main
46 /// reason is to allow reuse of pattern-matching code with
47 /// different 'alphabets'.  A second reason is to allow use
48 /// of different container classes for sequences and patterns,
49 /// e.g., the ncbi8na matching will work with both
50 /// string and vector<char>.
51 /// This would be much nicer with template template parameters,
52 /// but MSVC doesn't support them.
53 ///
54 
55 class CSeqMatch
56 {
57 public:
58     enum EMatch {
59         eNo,
60         eYes,
61         eMaybe
62     };
63     /// determine whether ncbi8na base s is a match to q.
64     static EMatch CompareNcbi8na(char s, char q);
65     /// determine whether seq matches pattern pat starting at position pos.
66     ///
67     /// It is the caller's responsibility to ensure that there are
68     /// enough 'characters' in seq, i.e., that pos + pat.size() <= seq.size()
69     template<class Seq, class Pat, class Compare_fun>
Match(const Seq & seq,const Pat & pat,TSeqPos pos,const Compare_fun compare_fun)70     static EMatch Match(const Seq& seq, const Pat& pat, TSeqPos pos,
71                         const Compare_fun compare_fun)
72     {
73 
74         // for efficiency, no check is made that we're not looking
75         // past the end of seq; caller must assure this
76 
77         EMatch rv = eYes;
78         // check pattern positions in succession
79         for (unsigned int i = 0;  i < pat.size();  i++) {
80             EMatch res = compare_fun(seq[pos+i], pat[i]);
81             if (res == eNo) {
82                 return eNo;
83             }
84             if (res == eMaybe) {
85                 rv = eMaybe;
86             }
87         }
88         // if we got here, everybody at least could have matched
89         return rv;
90     }
91 
92     template<class Seq, class Pat>
MatchNcbi8na(const Seq & seq,const Pat & pat,TSeqPos pos)93     static EMatch MatchNcbi8na(const Seq& seq,
94                                const Pat& pat, TSeqPos pos)
95     {
96         return Match(seq, pat, pos, CompareNcbi8na);
97     }
98 
99     template <class Seq, class Pat>
100     struct SMatchNcbi8na
101     {
operator ()CSeqMatch::SMatchNcbi8na102         EMatch operator() (const Seq& seq,
103                            const Pat& pat, TSeqPos pos) const
104         {
105             return CSeqMatch::Match(seq, pat, pos, CompareNcbi8na);
106         }
107     };
108 
109     /// find all places where seq must or might match pat
110     template<class Seq, class Pat, class Match_fun>
FindMatches(const Seq & seq,const Pat & pat,vector<TSeqPos> & definite_matches,vector<TSeqPos> & possible_matches,Match_fun match)111     static void FindMatches(const Seq& seq,
112                             const Pat& pat,
113                             vector<TSeqPos>& definite_matches,
114                             vector<TSeqPos>& possible_matches,
115                             Match_fun match)
116     {
117         for (unsigned int i = 0;  i < seq.size() - pat.size() + 1; i++) {
118             EMatch res = match(seq, pat, i);
119             if (res == eNo) {
120                 continue;
121             }
122             if (res == eYes) {
123                 definite_matches.push_back(i);
124                 continue;
125             }
126             // otherwise must be eMaybe
127             possible_matches.push_back(i);
128         }
129     }
130 
131     template<class Seq, class Pat>
FindMatchesNcbi8na(const Seq & seq,const Pat & pat,vector<TSeqPos> & definite_matches,vector<TSeqPos> & possible_matches)132     static void FindMatchesNcbi8na(const Seq& seq,
133                                    const Pat& pat,
134                                    vector<TSeqPos>& definite_matches,
135                                    vector<TSeqPos>& possible_matches)
136     {
137         FindMatches(seq, pat,
138                     definite_matches, possible_matches,
139                     SMatchNcbi8na<Seq, Pat>());
140     }
141 
142     /// stuff for dealing with ncbi8na.
143     /// doesn't really belong here, but oh well
144 
145     /// convert a single base from IUPAC to ncbi8na
146     NCBI_XALGOSEQ_EXPORT static char IupacToNcbi8na(char in);
147     /// convert a whole string from IUPAC to ncbi8na
148     NCBI_XALGOSEQ_EXPORT static void IupacToNcbi8na(const string& in, string& out);
149     /// complement an ncbi8na sequence in place
150     NCBI_XALGOSEQ_EXPORT static void CompNcbi8na(string& seq8na);
151     /// complement a single ncbi8na base
152     NCBI_XALGOSEQ_EXPORT static char CompNcbi8na(char);
153 };
154 
155 
156 
157 
158 
159 // works on ncbi8na
160 // s can match q iff they have some set bits in common
161 // s must match q iff it represents a subset,
162 // i.e., if no bits set in s are unset in q
163 inline
CompareNcbi8na(char s,char q)164 CSeqMatch::EMatch CSeqMatch::CompareNcbi8na(char s, char q)
165 {
166     if (!(s & q)) {
167         // nothing in common
168         return eNo;
169     }
170     if (s & ~q) {
171         return eMaybe;
172     }
173     return eYes;
174 }
175 
176 
177 END_NCBI_SCOPE
178 
179 #endif   // GUI_CORE_ALGO_BASIC___SEQ_MATCH__HPP
180