1 #ifndef UTIL___MULTIPATTERN_SEARCH__HPP
2 #define UTIL___MULTIPATTERN_SEARCH__HPP
3 
4 /*  $Id: multipattern_search.hpp 568752 2018-08-09 20:30:17Z kachalos $
5 * ===========================================================================
6 *
7 *                            PUBLIC DOMAIN NOTICE
8 *               National Center for Biotechnology Information
9 *
10 *  This software/database is a "United States Government Work" under the
11 *  terms of the United States Copyright Act.  It was written as part of
12 *  the author's official duties as a United States Government employee and
13 *  thus cannot be copyrighted.  This software/database is freely available
14 *  to the public for use. The National Library of Medicine and the U.S.
15 *  Government have not placed any restriction on its use or reproduction.
16 *
17 *  Although all reasonable efforts have been taken to ensure the accuracy
18 *  and reliability of the software and data, the NLM and the U.S.
19 *  Government do not and cannot warrant the performance or results that
20 *  may be obtained by using this software or data. The NLM and the U.S.
21 *  Government disclaim all warranties, express or implied, including
22 *  warranties of performance, merchantability or fitness for any particular
23 *  purpose.
24 *
25 *  Please cite the author in any work or product based on this material.
26 *
27 * ===========================================================================
28 *
29 * Author: Sema Kachalo, NCBI
30 *
31 */
32 
33 /// @file multipattern_search.hpp
34 /// Simultaneous search of multiple RegEx patterns in the input string
35 
36 #include <corelib/ncbistd.hpp>
37 #include <functional>
38 
39 BEGIN_NCBI_SCOPE
40 
41 class CRegExFSA;
42 
43 
44 ///////////////////////////////////////////////////////////////////////
45 ///
46 /// CMultipatternSearch
47 ///
48 /// Simultaneous search of multiple string or RegEx patterns in the input string
49 ///
50 /// CMultipatternSearch builds a Finite State Machine (FSM)
51 /// from a list of search strings or regular expression patterns.
52 /// It can then search all patterns simultaneously,
53 /// requiring only a single traversal of the input string.
54 /// Use this class to increase the search performance
55 /// when the number of search patterns is large (10 and more)
56 /// If the patterns are known in advance, FSM can be exported as C code
57 /// and compiled for the further performance improvement.
58 
59 class NCBI_XUTIL_EXPORT CMultipatternSearch
60 {
61 public:
62     CMultipatternSearch();
63     ~CMultipatternSearch();
64 
65     /// Search flags (for non-RegEx patterns only!)
66     enum EFlags {
67         fNoCase      = 1 << 0,
68         fBeginString = 1 << 1,
69         fEndString   = 1 << 2,
70         fWholeString = fBeginString | fEndString,
71         fBeginWord   = 1 << 3,
72         fEndWord     = 1 << 4,
73         fWholeWord   = fBeginWord | fEndWord
74     };
75     DECLARE_SAFE_FLAGS_TYPE(EFlags, TFlags);
76 
77 
78     ///@{
79     /// Add search pattern to the FSM
80     ///
81     /// @param pattern
82     ///   A search pattern to add to the FSM.
83     ///   If begins with a slash (/), then it is considered a RegEx.
84     /// @param flags
85     ///   Additional search conditions.
86     ///   If the first argument is a RegEx, then the flags are ignored.
87     void AddPattern(const char*   pattern, TFlags flags = 0);
AddPattern(const string & pattern,TFlags flags=0)88     void AddPattern(const string& pattern, TFlags flags = 0) { AddPattern(pattern.c_str(), flags); }
89     void AddPatterns(const vector<string>& patterns);
90     void AddPatterns(const vector<pair<string, TFlags>>& patterns);
91     ///@}
92 
93     /// Quote special characters to insert string into regular expression
94     static string QuoteString(const string& str);
95 
96     /// Generate graphical representation of the FSM in DOT format.
97     /// For more details, see http://www.graphviz.org/
98     ///
99     /// @param out
100     ///   A stream to receive the output.
101     void GenerateDotGraph(ostream& out) const;
102 
103     /// Generate C++ array/map data.
104     ///
105     /// @param out
106     ///   A stream to receive the output.
107     void GenerateArrayMapData(ostream& out) const;
108 
109     /// Generate C code for the FSM search.
110     ///
111     /// @param out
112     ///   A stream to receive the output.
113     void GenerateSourceCode(ostream& out) const;
114 
115     /// When the pattern is found, the search can be stopped or continued
116     enum EOnFind {
117         eStopSearch,
118         eContinueSearch
119     };
120 
121     ///@{
122     /// Run the FSM search on the input string
123     ///
124     /// @param input
125     ///   Input string.
126     /// @param found_callback
127     ///   Function or function-like object to call when the pattern is found.
128     ///   It can accept one or two parameters and return void or EOnFind.
129     ///   If it returns eStopSearch, the search terminates.
130     ///   If it returns eContinueSearch or void, the search continues.
131     /// @sa CFoundCallback
132     ///
133     /// @code
134     ///  Search(str, [](size_t pattern) { cout << "Found " << pattern << "\n"; });
135     ///  Search(str, [](size_t pattern) -> CMultipatternSearch::EOnFind { cout << "Found " << pattern << "\n";  return CMultipatternSearch::eContinueSearch; });
136     ///  Search(str, [](size_t pattern, size_t position) { cout << "Found " << pattern << " " << position << "\n"; });
137     ///  Search(str, [](size_t pattern, size_t position) -> CMultipatternSearch::EOnFind { cout << "Found " << pattern << " " << position << "\n";  return CMultipatternSearch::eContinueSearch; });
138     /// @endcode
139 
140     typedef std::function<void(size_t)> VoidCall1;
141     typedef std::function<void(size_t, size_t)> VoidCall2;
142     typedef std::function<bool(size_t)> BoolCall1;
143     typedef std::function<bool(size_t, size_t)> BoolCall2;
144 
145     void Search(const char*   input, VoidCall1 found_callback) const;
Search(const string & input,VoidCall1 found_callback) const146     void Search(const string& input, VoidCall1 found_callback) const { Search(input.c_str(), found_callback); }
147     void Search(const char*   input, VoidCall2 found_callback) const;
Search(const string & input,VoidCall2 found_callback) const148     void Search(const string& input, VoidCall2 found_callback) const { Search(input.c_str(), found_callback); }
149     void Search(const char*   input, BoolCall1 found_callback) const;
Search(const string & input,BoolCall1 found_callback) const150     void Search(const string& input, BoolCall1 found_callback) const { Search(input.c_str(), found_callback); }
151     void Search(const char*   input, BoolCall2 found_callback) const;
Search(const string & input,BoolCall2 found_callback) const152     void Search(const string& input, BoolCall2 found_callback) const { Search(input.c_str(), found_callback); }
153     ///@}
154 
155     ///@{
156     /// Run the FSM search on the input string using the strucutre prebuilt by multipattern -A
157     ///
158     /// @param input
159     ///   Input string.
160     /// @param states
161     ///   generated by multipattern -A
162     /// @param emit
163     ///   generated by multipattern -A
164     /// @param hits
165     ///   generated by multipattern -A
166     /// @param found_callback
167     ///   Function or function-like object to call when the pattern is found.
168     ///   It can accept one or two parameters and return void or EOnFind.
169     ///   If it returns eStopSearch, the search terminates.
170     ///   If it returns eContinueSearch or void, the search continues.
171     /// @sa CFoundCallback
172     ///
173     /// @code
174     ///  Search(str, [](size_t pattern) { cout << "Found " << pattern << "\n"; });
175     ///  Search(str, [](size_t pattern) -> CMultipatternSearch::EOnFind { cout << "Found " << pattern << "\n";  return CMultipatternSearch::eContinueSearch; });
176     ///  Search(str, [](size_t pattern, size_t position) { cout << "Found " << pattern << " " << position << "\n"; });
177     ///  Search(str, [](size_t pattern, size_t position) -> CMultipatternSearch::EOnFind { cout << "Found " << pattern << " " << position << "\n";  return CMultipatternSearch::eContinueSearch; });
178     /// @endcode
179     ///
180     /// Example:
181     ///
182     /// static void Screen(const char* str, bool *result)
183     /// {
184     ///     #define _FSM_EMIT static bool emit[]
185     ///     #define _FSM_HITS static map<size_t, vector<size_t>> hits
186     ///     #define _FSM_STATES static size_t states[]
187     ///     #include "GENERATED_FILE.inc"
188     ///     #undef _FSM_EMIT
189     ///     #undef _FSM_HITS
190     ///     #undef _FSM_STATES
191     ///     CMultipatternSearch::Search(str, states, emit, hits, [result](size_t n) { result[n] = true; });
192     /// }
193 
194     static void Search(const char*   input, const size_t* states, const bool* emit, const map<size_t, vector<size_t>>& hits, VoidCall1 found_callback);
Search(const string & input,const size_t * states,const bool * emit,const map<size_t,vector<size_t>> & hits,VoidCall1 found_callback)195     static void Search(const string& input, const size_t* states, const bool* emit, const map<size_t, vector<size_t>>& hits, VoidCall1 found_callback) { Search(input.c_str(), states, emit, hits, found_callback); }
196     static void Search(const char*   input, const size_t* states, const bool* emit, const map<size_t, vector<size_t>>& hits, VoidCall2 found_callback);
Search(const string & input,const size_t * states,const bool * emit,const map<size_t,vector<size_t>> & hits,VoidCall2 found_callback)197     static void Search(const string& input, const size_t* states, const bool* emit, const map<size_t, vector<size_t>>& hits, VoidCall2 found_callback) { Search(input.c_str(), states, emit, hits, found_callback); }
198     static void Search(const char*   input, const size_t* states, const bool* emit, const map<size_t, vector<size_t>>& hits, BoolCall1 found_callback);
Search(const string & input,const size_t * states,const bool * emit,const map<size_t,vector<size_t>> & hits,BoolCall1 found_callback)199     static void Search(const string& input, const size_t* states, const bool* emit, const map<size_t, vector<size_t>>& hits, BoolCall1 found_callback) { Search(input.c_str(), states, emit, hits, found_callback); }
200     static void Search(const char*   input, const size_t* states, const bool* emit, const map<size_t, vector<size_t>>& hits, BoolCall2 found_callback);
Search(const string & input,const size_t * states,const bool * emit,const map<size_t,vector<size_t>> & hits,BoolCall2 found_callback)201     static void Search(const string& input, const size_t* states, const bool* emit, const map<size_t, vector<size_t>>& hits, BoolCall2 found_callback) { Search(input.c_str(), states, emit, hits, found_callback); }
202     /// @endcode
203 
204 private:
205     /// Finit State Machine that does all work
206     unique_ptr<CRegExFSA> m_FSM;
207 };
208 
209 
210 END_NCBI_SCOPE
211 
212 #endif /* UTIL___MULTIPATTERN_SEARCH__HPP */
213