1 /* $Id: seq_match.hpp 103491 2007-05-04 17:18:18Z kazimird $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Josh Cherry
27 *
28 * File Description: Simple pattern matching for sequences
29 *
30 */
31
32
33 #ifndef GUI_CORE_ALGO_BASIC___SEQ_MATCH__HPP
34 #define GUI_CORE_ALGO_BASIC___SEQ_MATCH__HPP
35
36 #include <corelib/ncbistd.hpp>
37
38 BEGIN_NCBI_SCOPE
39
40 ///
41 /// This class provides functions for determining where
42 /// sequences, perhaps containing ambiguity codes,
43 /// must or can match patterns.
44 ///
45 /// The functions are highly templatized. The main
46 /// reason is to allow reuse of pattern-matching code with
47 /// different 'alphabets'. A second reason is to allow use
48 /// of different container classes for sequences and patterns,
49 /// e.g., the ncbi8na matching will work with both
50 /// string and vector<char>.
51 /// This would be much nicer with template template parameters,
52 /// but MSVC doesn't support them.
53 ///
54
55 class CSeqMatch
56 {
57 public:
58 enum EMatch {
59 eNo,
60 eYes,
61 eMaybe
62 };
63 /// determine whether ncbi8na base s is a match to q.
64 static EMatch CompareNcbi8na(char s, char q);
65 /// determine whether seq matches pattern pat starting at position pos.
66 ///
67 /// It is the caller's responsibility to ensure that there are
68 /// enough 'characters' in seq, i.e., that pos + pat.size() <= seq.size()
69 template<class Seq, class Pat, class Compare_fun>
Match(const Seq & seq,const Pat & pat,TSeqPos pos,const Compare_fun compare_fun)70 static EMatch Match(const Seq& seq, const Pat& pat, TSeqPos pos,
71 const Compare_fun compare_fun)
72 {
73
74 // for efficiency, no check is made that we're not looking
75 // past the end of seq; caller must assure this
76
77 EMatch rv = eYes;
78 // check pattern positions in succession
79 for (unsigned int i = 0; i < pat.size(); i++) {
80 EMatch res = compare_fun(seq[pos+i], pat[i]);
81 if (res == eNo) {
82 return eNo;
83 }
84 if (res == eMaybe) {
85 rv = eMaybe;
86 }
87 }
88 // if we got here, everybody at least could have matched
89 return rv;
90 }
91
92 template<class Seq, class Pat>
MatchNcbi8na(const Seq & seq,const Pat & pat,TSeqPos pos)93 static EMatch MatchNcbi8na(const Seq& seq,
94 const Pat& pat, TSeqPos pos)
95 {
96 return Match(seq, pat, pos, CompareNcbi8na);
97 }
98
99 template <class Seq, class Pat>
100 struct SMatchNcbi8na
101 {
operator ()CSeqMatch::SMatchNcbi8na102 EMatch operator() (const Seq& seq,
103 const Pat& pat, TSeqPos pos) const
104 {
105 return CSeqMatch::Match(seq, pat, pos, CompareNcbi8na);
106 }
107 };
108
109 /// find all places where seq must or might match pat
110 template<class Seq, class Pat, class Match_fun>
FindMatches(const Seq & seq,const Pat & pat,vector<TSeqPos> & definite_matches,vector<TSeqPos> & possible_matches,Match_fun match)111 static void FindMatches(const Seq& seq,
112 const Pat& pat,
113 vector<TSeqPos>& definite_matches,
114 vector<TSeqPos>& possible_matches,
115 Match_fun match)
116 {
117 for (unsigned int i = 0; i < seq.size() - pat.size() + 1; i++) {
118 EMatch res = match(seq, pat, i);
119 if (res == eNo) {
120 continue;
121 }
122 if (res == eYes) {
123 definite_matches.push_back(i);
124 continue;
125 }
126 // otherwise must be eMaybe
127 possible_matches.push_back(i);
128 }
129 }
130
131 template<class Seq, class Pat>
FindMatchesNcbi8na(const Seq & seq,const Pat & pat,vector<TSeqPos> & definite_matches,vector<TSeqPos> & possible_matches)132 static void FindMatchesNcbi8na(const Seq& seq,
133 const Pat& pat,
134 vector<TSeqPos>& definite_matches,
135 vector<TSeqPos>& possible_matches)
136 {
137 FindMatches(seq, pat,
138 definite_matches, possible_matches,
139 SMatchNcbi8na<Seq, Pat>());
140 }
141
142 /// stuff for dealing with ncbi8na.
143 /// doesn't really belong here, but oh well
144
145 /// convert a single base from IUPAC to ncbi8na
146 NCBI_XALGOSEQ_EXPORT static char IupacToNcbi8na(char in);
147 /// convert a whole string from IUPAC to ncbi8na
148 NCBI_XALGOSEQ_EXPORT static void IupacToNcbi8na(const string& in, string& out);
149 /// complement an ncbi8na sequence in place
150 NCBI_XALGOSEQ_EXPORT static void CompNcbi8na(string& seq8na);
151 /// complement a single ncbi8na base
152 NCBI_XALGOSEQ_EXPORT static char CompNcbi8na(char);
153 };
154
155
156
157
158
159 // works on ncbi8na
160 // s can match q iff they have some set bits in common
161 // s must match q iff it represents a subset,
162 // i.e., if no bits set in s are unset in q
163 inline
CompareNcbi8na(char s,char q)164 CSeqMatch::EMatch CSeqMatch::CompareNcbi8na(char s, char q)
165 {
166 if (!(s & q)) {
167 // nothing in common
168 return eNo;
169 }
170 if (s & ~q) {
171 return eMaybe;
172 }
173 return eYes;
174 }
175
176
177 END_NCBI_SCOPE
178
179 #endif // GUI_CORE_ALGO_BASIC___SEQ_MATCH__HPP
180