1 #ifndef UTIL___REGEXP_TEMPLATE_TESTER__HPP 2 #define UTIL___REGEXP_TEMPLATE_TESTER__HPP 3 4 /* $Id: regexp_template_tester.hpp 575895 2018-12-06 13:05:45Z ivanov $ 5 * =========================================================================== 6 * 7 * PUBLIC DOMAIN NOTICE 8 * National Center for Biotechnology Information 9 * 10 * This software/database is a "United States Government Work" under the 11 * terms of the United States Copyright Act. It was written as part of 12 * the author's official duties as a United States Government employee and 13 * thus cannot be copyrighted. This software/database is freely available 14 * to the public for use. The National Library of Medicine and the U.S. 15 * Government have not placed any restriction on its use or reproduction. 16 * 17 * Although all reasonable efforts have been taken to ensure the accuracy 18 * and reliability of the software and data, the NLM and the U.S. 19 * Government do not and cannot warrant the performance or results that 20 * may be obtained by using this software or data. The NLM and the U.S. 21 * Government disclaim all warranties, express or implied, including 22 * warranties of performance, merchantability or fitness for any particular 23 * purpose. 24 * 25 * Please cite the author in any work or product based on this material. 26 * 27 * =========================================================================== 28 * 29 * Author: Vladimir Ivanov 30 * 31 */ 32 33 /// @file regexp_template_tester.hpp 34 /// Regexp template tester based on the Perl Compatible Regular Expression 35 /// (pcre) library. Allow to test delimited text data against template 36 /// with regular expression rules. 
37 38 #include <corelib/ncbistr.hpp> 39 40 BEGIN_NCBI_SCOPE 41 42 43 /** @addtogroup Regexp 44 * 45 * @{ 46 */ 47 48 ///////////////////////////////////////////////////////////////////////////// 49 /// 50 /// CRegexpTemplateTester -- 51 /// 52 /// Match delimited text data against template with regular expression rules. 53 /// 54 /// Template file can have some types of lines: 55 /// 1) comment line, started with // symbols (can be changed); 56 /// 2) command lines, started with # symbol (can be changed); 57 /// 3) regular expressions, that are used to match corresponding line 58 /// in the tested file/stream. Usually number of lines in the tested file 59 /// should be the same as the number of rule lines in the template, 60 /// unless you use 'stop' or 'skip' template commands (see below). 61 /// 62 /// Template commands allow to use variables. Each variable should be surrounded 63 /// with figure brackets with preceding $ symbol, like ${var}. Except 'set' 64 /// command where we define variable. Note that this can be changed and you can use 65 /// any other symbols to separate variables. 66 /// 67 /// Commands: 68 /// 69 /// 1) set var=value 70 /// Define variable 'var', set it to 'value'. 71 /// 'value' can be string, any other variable or combination of it. 72 /// This type of variable works almost as constants in the C++, 73 /// they can be useful to predefine some frequently used values 74 /// in the regular expression rules. All variable names starts with 75 /// a letter and can have numbers, letters, '-' or '_'. 76 /// Example: 77 /// # set fruit=apple 78 /// # set color=red 79 /// # set str=${color} ${fruit} 80 /// // variable 'str' have value 'red apple' now 81 /// This is a rule line with a variable ${str} 82 /// 83 /// 2) echo <string> 84 /// Prints string/variables to stdout. Can be useful for debug purposes. 
///       Example:
///           # set fruit=apple
///           # echo some string
///           # echo ${fruit}
///           # echo We have an ${fruit}
///
/// 3) test <expression>
///       Compare a variable with a string value.
///       'value' can be a string, any other variable, or a combination of them.
///       Allowed comparison operators:
///           ==  - left value is equal to the right value;
///           !=  - left value is not equal to the right value;
///           =~  - left value matches the regexp on the right;
///           !~  - left value doesn't match the regexp on the right.
///       Spaces around the comparison operator count as part of
///       the compared strings on the left and right accordingly.
///       Example:
///           # set fruit=apple
///           # set color=red
///           # test ${fruit}==apple
///           # test ${fruit}!=orange
///           # test red ${fruit}==red apple
///           # test ${color} ${fruit}==red apple
///           # test ${fruit}=~\w{3,}
///           # set re=\w{3,}
///           # test ${fruit}=~${re}
///
/// 4) include path
///       Include the sub-template with name 'path' and process it.
///       This allows splitting big templates into parts, or separating a common
///       part of many templates. 'path' defines a relative path to
///       the sub-template from the directory of the template that includes it.
///       Sub-templates can include any other templates.
///       Example:
///           # include common_header.tpl
///           # include tests/test1
///           # include tests/test2
///
/// 5) inline variables
///       One more way to define variables, based on the data in the matching
///       string. For example, the compared file can have such a line:
///           We have 5 apples.
///
///       The corresponding template file may have the next rules:
///           We have ${count=[1-9]+} ${fruit=[a-z]+}\.
///           # test ${count}==5
///           # test ${fruit}==apples
///
/// 6) skip <expression>
///       Allows skipping lines in the tested file/stream.
///       It is possible to skip:
///           - a fixed number of lines;
///           - while lines match the specified regular expression (find not);
///           - until some line matches the specified regular expression (find);
///       Regular expressions can use variables here as well.
///       Note that for the while/until cases, testing stops on the first line that
///       does not meet (while) or meets (until) the specified criteria. So you can
///       parse that line again with a different rule. If you don't need that line,
///       you can use a 'skip 1' command on the next template line or a regular
///       expression that matches any string, like '.*'.
///       Example:
///           // Skip next 5 lines in the tested file/stream
///           # skip 5
///           // Skip all next lines started with a capital letter
///           # skip while ^[A-Z]+
///           // Skip all next lines until we find a string that has
///           // some digits followed by the word 'apple'.
///           # set fruit=apple
///           # skip until \d+ ${fruit}
///           We have ${count=\d+} ${fruit}
///           # test ${count}==1
///
/// 7) stop
///       Stops processing the current template. If located in the main template,
///       it forces the Compare() methods to return immediately. Can be useful for
///       debug purposes or if you want to check only some part of the file,
///       ignoring any lines below some point. Note that if the 'stop' command is
///       used in a sub-template, the testing engine stops processing that
///       sub-template only and continues processing from the next line in
///       the parent template.
///       Example:
///           # stop
///
/// Nested variables.
///
/// You can use nested variables in any expression and every command that supports them.
/// They can be useful in some cases, especially if you want to use the same symbols
/// that are used to define variables. For example, you can write the next template line:
///
///     We have ${count=[1-9]+} ${fruit=[a-z]+}\.
175 /// 176 /// It have correct syntax, but what if you want to make it a bit complex or strict? 177 /// For example next template line should generate parsing errors: 178 /// 179 /// // Match 2 or more digit numbers and fruits names with 4 letters at least 180 /// We have ${count=\d{2,}} ${fruit=\w{4,}}\. 181 /// 182 /// This is because {...} is used inside variable defnition. As solution you can 183 /// choice different symbols that will define the start and end sequences for all 184 /// variables, or use nested variables as specified below: 185 /// 186 /// # set d2=\d{2,} 187 /// # set w4=\w{4,} 188 /// We have ${count=${d2}} ${fruit=${w4}}\. 189 /// 190 /// Also, you can construct variable names on the fly: 191 /// 192 /// # set color=red 193 /// # echo ${${color}_${apple}} 194 /// 195 /// 196 /// Note, that all methods throw CRegexpTemplateTesterException on errors, 197 /// problems with parsing template commands or data mismatch. 198 /// 199 200 class CRegexpTemplateTester 201 { 202 public: 203 /// 204 enum EFlags { 205 fSkipEmptySourceLines = (1 << 0), ///< Skip empty lines in the source 206 fSkipEmptyTemplateLines = (1 << 1), ///< Skip empty lines in the template 207 fSkipEmptyLines = fSkipEmptySourceLines | fSkipEmptyTemplateLines 208 }; 209 typedef unsigned int TFlags; ///< Binary OR of "EFlags" 210 211 /// Default constructor. 212 CRegexpTemplateTester(TFlags flags = 0); 213 214 215 /// Compare file against template (file version). 216 /// 217 /// @param file_path 218 /// Path to the checking file. 219 /// @param template_path 220 /// Path to the corresponding template. 221 /// @return 222 /// Nothing on success. 223 /// Throw CRegexpTemplateTesterException on error or mismatch. 224 /// 225 void Compare(const string& file_path, const string& template_path); 226 227 /// Compare file against template (stream version). 228 /// 229 /// @param file_stream 230 /// Input stream with checking data. 
231 /// @param template_stream 232 /// Input stream with corresponding template data. 233 /// @return 234 /// Nothing on success. 235 /// Throw CRegexpTemplateTesterException on error or mismatch. 236 /// @note 237 /// Due to file-oriented nature of the 'include' command in the 238 /// templates, it works a little different that in the file-based 239 /// version. It is better do not use 'include' with streams at all, 240 /// but if you want it, be aware. We don't have path to the directory 241 /// with the original template, so included sub-template should be 242 /// located in the current directory. 243 /// 244 void Compare(istream& file_stream, istream& template_stream); 245 246 247 // --- Auxiliary methods used to tune up parsing for specific needs. 248 249 /// Change strings defining start/end of variables. 250 /// By default use next syntax: ${var} 251 void SetVarScope(string& start, string& end); 252 253 /// Change string defining start of comments line in templates. 254 /// Default value: "//" 255 void SetCommentStart(string& str); 256 257 /// Change string defining start of template commands and operations. 258 /// Default value: "#" 259 void SetCommandStart(string& str); 260 261 /// Change delimiters string, used for comparing data and templates. 
262 /// Default value: "\r\n" 263 void SetDelimiters(string& str); 264 265 266 // --- Debug functions 267 268 void PrintVars(void) const; 269 void PrintVar (const string& name) const; 270 string GetVar (const string& name) const; 271 272 typedef map<string, string> TVarMap; 273 const TVarMap& GetVars(void) const; 274 275 private: 276 // Operations 277 void x_Op_Set (CTempString str); 278 void x_Op_Echo (CTempString str); 279 void x_Op_Test (CTempString str); 280 void x_Op_Include (CTempString str, istream& file_stm); 281 void x_Op_Skip (CTempString str, istream& file_stm); 282 283 private: 284 // Internal methods 285 286 /// Reset object state 287 void x_Reset(void); 288 289 /// Processing source 290 enum ESource { 291 eFile, ///< source file/stream 292 eTemplate ///< template 293 }; 294 295 /// The reason of stopping x_Compare(), if no error. 296 enum EResult { 297 eTemplateEOF, 298 eStop 299 }; 300 typedef list<string> TVarList; 301 302 /// Main compare method, compare streams. 303 /// Can be used recursively to process includes. 304 /// Return TRUE if 'stop' command found. 305 EResult x_Compare(istream& file_stream, istream& template_stream); 306 307 /// Process/compare lines. 308 void x_CompareLines(CTempString file_line, CTempString template_line); 309 310 /// Parse variable from string, return its length. 311 /// Can process nested variables. 312 SIZE_TYPE x_ParseVar(CTempString str, SIZE_TYPE pos) const; 313 314 /// Parse/check variable name from string, return its length. 315 SIZE_TYPE x_ParseVarName (CTempString str, SIZE_TYPE pos) const; 316 317 /// Replace all variables in the string with corresponding values. 318 /// Also used for preparing inline variables for consecutive regexp matching. 319 string x_SubstituteVars(CTempString str, TVarList* inline_vars) const; 320 321 /// Get line from the stream 'is'. 
322 istream& x_GetLine(istream& is, ESource src); 323 324 private: 325 TFlags m_Flags; ///< Processing flags 326 string m_VarStart; ///< Variable definition start 327 string m_VarEnd; ///< Variable definition end 328 string m_OpStart; ///< Start of the template command line 329 string m_CommentStart; ///< Start of the comment line 330 string m_EOLs; ///< Lines delimiters 331 TVarMap m_Vars; ///< Map of variables/values 332 333 // Variables for currently processing file/(sub-)template 334 string m_FileName; ///< Current file name (if any) 335 string m_FileLine; ///< Currently processing file line 336 SIZE_TYPE m_FileLineNum; ///< Current file/stream line number 337 string m_TemplateName; ///< Current template name 338 string m_TemplateLine; ///< Currently processing template line 339 SIZE_TYPE m_TemplateLineNum; ///< Current template line number 340 341 bool m_ReprocessFileLine; ///< TRUE if m_FileLine should be reprocessed 342 ///< with next template data line 343 }; 344 345 346 347 ///////////////////////////////////////////////////////////////////////////// 348 /// 349 /// CRegexpTemplateTesterException -- 350 /// 351 /// Define exceptions generated by CRegexpTemplateTester. 352 353 class CRegexpTemplateTesterException : public CCoreException 354 { 355 public: 356 /// Error types that tester can generate. 357 enum EErrCode { 358 eOpenFile, ///< file open error 359 eMismatchLength, ///< file/template line number mismatch 360 eMismatchContent, ///< file/template lines do not match 361 eVarNotFound, ///< variable not found 362 eVarErr, ///< variable definition error 363 eOpUnknown, ///< unknown operation 364 eOpErr, ///< operation definition error 365 eOpTest ///< 'test' operation return FALSE 366 } ; 367 virtual const char* GetErrCodeString(void) const override; 368 NCBI_EXCEPTION_DEFAULT(CRegexpTemplateTesterException, CCoreException); 369 }; 370 371 372 /* @} */ 373 374 END_NCBI_SCOPE 375 376 377 #endif /* UTIL___REGEXP_TEMPLATE_TESTER__HPP */ 378