1 #ifndef UTIL___MULTIPATTERN_SEARCH__HPP 2 #define UTIL___MULTIPATTERN_SEARCH__HPP 3 4 /* $Id: multipattern_search.hpp 568752 2018-08-09 20:30:17Z kachalos $ 5 * =========================================================================== 6 * 7 * PUBLIC DOMAIN NOTICE 8 * National Center for Biotechnology Information 9 * 10 * This software/database is a "United States Government Work" under the 11 * terms of the United States Copyright Act. It was written as part of 12 * the author's official duties as a United States Government employee and 13 * thus cannot be copyrighted. This software/database is freely available 14 * to the public for use. The National Library of Medicine and the U.S. 15 * Government have not placed any restriction on its use or reproduction. 16 * 17 * Although all reasonable efforts have been taken to ensure the accuracy 18 * and reliability of the software and data, the NLM and the U.S. 19 * Government do not and cannot warrant the performance or results that 20 * may be obtained by using this software or data. The NLM and the U.S. 21 * Government disclaim all warranties, express or implied, including 22 * warranties of performance, merchantability or fitness for any particular 23 * purpose. 24 * 25 * Please cite the author in any work or product based on this material. 26 * 27 * =========================================================================== 28 * 29 * Author: Sema Kachalo, NCBI 30 * 31 */ 32 33 /// @file multipattern_search.hpp 34 /// Simultaneous search of multiple RegEx patterns in the input string 35 36 #include <corelib/ncbistd.hpp> 37 #include <functional> 38 39 BEGIN_NCBI_SCOPE 40 41 class CRegExFSA; 42 43 44 /////////////////////////////////////////////////////////////////////// 45 /// 46 /// CMultipatternSearch 47 /// 48 /// Simultaneous search of multiple string or RegEx patterns in the input string 49 /// 50 /// CMultipatternSearch builds a Finite State Machine (FSM) 51 /// from a list of search strings or regular expression patterns. 52 /// It can then search all patterns simultaneously, 53 /// requiring only a single traversal of the input string. 54 /// Use this class to increase the search performance 55 /// when the number of search patterns is large (10 and more) 56 /// If the patterns are known in advance, FSM can be exported as C code 57 /// and compiled for the further performance improvement. 58 59 class NCBI_XUTIL_EXPORT CMultipatternSearch 60 { 61 public: 62 CMultipatternSearch(); 63 ~CMultipatternSearch(); 64 65 /// Search flags (for non-RegEx patterns only!) 66 enum EFlags { 67 fNoCase = 1 << 0, 68 fBeginString = 1 << 1, 69 fEndString = 1 << 2, 70 fWholeString = fBeginString | fEndString, 71 fBeginWord = 1 << 3, 72 fEndWord = 1 << 4, 73 fWholeWord = fBeginWord | fEndWord 74 }; 75 DECLARE_SAFE_FLAGS_TYPE(EFlags, TFlags); 76 77 78 ///@{ 79 /// Add search pattern to the FSM 80 /// 81 /// @param pattern 82 /// A search pattern to add to the FSM. 83 /// If begins with a slash (/), then it is considered a RegEx. 84 /// @param flags 85 /// Additional search conditions. 86 /// If the first argument is a RegEx, then the flags are ignored. 87 void AddPattern(const char* pattern, TFlags flags = 0); AddPattern(const string & pattern,TFlags flags=0)88 void AddPattern(const string& pattern, TFlags flags = 0) { AddPattern(pattern.c_str(), flags); } 89 void AddPatterns(const vector<string>& patterns); 90 void AddPatterns(const vector<pair<string, TFlags>>& patterns); 91 ///@} 92 93 /// Quote special characters to insert string into regular expression 94 static string QuoteString(const string& str); 95 96 /// Generate graphical representation of the FSM in DOT format. 97 /// For more details, see http://www.graphviz.org/ 98 /// 99 /// @param out 100 /// A stream to receive the output. 101 void GenerateDotGraph(ostream& out) const; 102 103 /// Generate C++ array/map data. 104 /// 105 /// @param out 106 /// A stream to receive the output. 107 void GenerateArrayMapData(ostream& out) const; 108 109 /// Generate C code for the FSM search. 110 /// 111 /// @param out 112 /// A stream to receive the output. 113 void GenerateSourceCode(ostream& out) const; 114 115 /// When the pattern is found, the search can be stopped or continued 116 enum EOnFind { 117 eStopSearch, 118 eContinueSearch 119 }; 120 121 ///@{ 122 /// Run the FSM search on the input string 123 /// 124 /// @param input 125 /// Input string. 126 /// @param found_callback 127 /// Function or function-like object to call when the pattern is found. 128 /// It can accept one or two parameters and return void or EOnFind. 129 /// If it returns eStopSearch, the search terminates. 130 /// If it returns eContinueSearch or void, the search continues. 131 /// @sa CFoundCallback 132 /// 133 /// @code 134 /// Search(str, [](size_t pattern) { cout << "Found " << pattern << "\n"; }); 135 /// Search(str, [](size_t pattern) -> CMultipatternSearch::EOnFind { cout << "Found " << pattern << "\n"; return CMultipatternSearch::eContinueSearch; }); 136 /// Search(str, [](size_t pattern, size_t position) { cout << "Found " << pattern << " " << position << "\n"; }); 137 /// Search(str, [](size_t pattern, size_t position) -> CMultipatternSearch::EOnFind { cout << "Found " << pattern << " " << position << "\n"; return CMultipatternSearch::eContinueSearch; }); 138 /// @endcode 139 140 typedef std::function<void(size_t)> VoidCall1; 141 typedef std::function<void(size_t, size_t)> VoidCall2; 142 typedef std::function<bool(size_t)> BoolCall1; 143 typedef std::function<bool(size_t, size_t)> BoolCall2; 144 145 void Search(const char* input, VoidCall1 found_callback) const; Search(const string & input,VoidCall1 found_callback) const146 void Search(const string& input, VoidCall1 found_callback) const { Search(input.c_str(), found_callback); } 147 void Search(const char* input, VoidCall2 found_callback) const; Search(const string & input,VoidCall2 found_callback) const148 void Search(const string& input, VoidCall2 found_callback) const { Search(input.c_str(), found_callback); } 149 void Search(const char* input, BoolCall1 found_callback) const; Search(const string & input,BoolCall1 found_callback) const150 void Search(const string& input, BoolCall1 found_callback) const { Search(input.c_str(), found_callback); } 151 void Search(const char* input, BoolCall2 found_callback) const; Search(const string & input,BoolCall2 found_callback) const152 void Search(const string& input, BoolCall2 found_callback) const { Search(input.c_str(), found_callback); } 153 ///@} 154 155 ///@{ 156 /// Run the FSM search on the input string using the strucutre prebuilt by multipattern -A 157 /// 158 /// @param input 159 /// Input string. 160 /// @param states 161 /// generated by multipattern -A 162 /// @param emit 163 /// generated by multipattern -A 164 /// @param hits 165 /// generated by multipattern -A 166 /// @param found_callback 167 /// Function or function-like object to call when the pattern is found. 168 /// It can accept one or two parameters and return void or EOnFind. 169 /// If it returns eStopSearch, the search terminates. 170 /// If it returns eContinueSearch or void, the search continues. 171 /// @sa CFoundCallback 172 /// 173 /// @code 174 /// Search(str, [](size_t pattern) { cout << "Found " << pattern << "\n"; }); 175 /// Search(str, [](size_t pattern) -> CMultipatternSearch::EOnFind { cout << "Found " << pattern << "\n"; return CMultipatternSearch::eContinueSearch; }); 176 /// Search(str, [](size_t pattern, size_t position) { cout << "Found " << pattern << " " << position << "\n"; }); 177 /// Search(str, [](size_t pattern, size_t position) -> CMultipatternSearch::EOnFind { cout << "Found " << pattern << " " << position << "\n"; return CMultipatternSearch::eContinueSearch; }); 178 /// @endcode 179 /// 180 /// Example: 181 /// 182 /// static void Screen(const char* str, bool *result) 183 /// { 184 /// #define _FSM_EMIT static bool emit[] 185 /// #define _FSM_HITS static map<size_t, vector<size_t>> hits 186 /// #define _FSM_STATES static size_t states[] 187 /// #include "GENERATED_FILE.inc" 188 /// #undef _FSM_EMIT 189 /// #undef _FSM_HITS 190 /// #undef _FSM_STATES 191 /// CMultipatternSearch::Search(str, states, emit, hits, [result](size_t n) { result[n] = true; }); 192 /// } 193 194 static void Search(const char* input, const size_t* states, const bool* emit, const map<size_t, vector<size_t>>& hits, VoidCall1 found_callback); Search(const string & input,const size_t * states,const bool * emit,const map<size_t,vector<size_t>> & hits,VoidCall1 found_callback)195 static void Search(const string& input, const size_t* states, const bool* emit, const map<size_t, vector<size_t>>& hits, VoidCall1 found_callback) { Search(input.c_str(), states, emit, hits, found_callback); } 196 static void Search(const char* input, const size_t* states, const bool* emit, const map<size_t, vector<size_t>>& hits, VoidCall2 found_callback); Search(const string & input,const size_t * states,const bool * emit,const map<size_t,vector<size_t>> & hits,VoidCall2 found_callback)197 static void Search(const string& input, const size_t* states, const bool* emit, const map<size_t, vector<size_t>>& hits, VoidCall2 found_callback) { Search(input.c_str(), states, emit, hits, found_callback); } 198 static void Search(const char* input, const size_t* states, const bool* emit, const map<size_t, vector<size_t>>& hits, BoolCall1 found_callback); Search(const string & input,const size_t * states,const bool * emit,const map<size_t,vector<size_t>> & hits,BoolCall1 found_callback)199 static void Search(const string& input, const size_t* states, const bool* emit, const map<size_t, vector<size_t>>& hits, BoolCall1 found_callback) { Search(input.c_str(), states, emit, hits, found_callback); } 200 static void Search(const char* input, const size_t* states, const bool* emit, const map<size_t, vector<size_t>>& hits, BoolCall2 found_callback); Search(const string & input,const size_t * states,const bool * emit,const map<size_t,vector<size_t>> & hits,BoolCall2 found_callback)201 static void Search(const string& input, const size_t* states, const bool* emit, const map<size_t, vector<size_t>>& hits, BoolCall2 found_callback) { Search(input.c_str(), states, emit, hits, found_callback); } 202 /// @endcode 203 204 private: 205 /// Finit State Machine that does all work 206 unique_ptr<CRegExFSA> m_FSM; 207 }; 208 209 210 END_NCBI_SCOPE 211 212 #endif /* UTIL___MULTIPATTERN_SEARCH__HPP */ 213