1 #ifndef UTIL___REGEXP_TEMPLATE_TESTER__HPP
2 #define UTIL___REGEXP_TEMPLATE_TESTER__HPP
3 
4 /*  $Id: regexp_template_tester.hpp 575895 2018-12-06 13:05:45Z ivanov $
5  * ===========================================================================
6  *
7  *                            PUBLIC DOMAIN NOTICE
8  *               National Center for Biotechnology Information
9  *
10  *  This software/database is a "United States Government Work" under the
11  *  terms of the United States Copyright Act.  It was written as part of
12  *  the author's official duties as a United States Government employee and
13  *  thus cannot be copyrighted.  This software/database is freely available
14  *  to the public for use. The National Library of Medicine and the U.S.
15  *  Government have not placed any restriction on its use or reproduction.
16  *
17  *  Although all reasonable efforts have been taken to ensure the accuracy
18  *  and reliability of the software and data, the NLM and the U.S.
19  *  Government do not and cannot warrant the performance or results that
20  *  may be obtained by using this software or data. The NLM and the U.S.
21  *  Government disclaim all warranties, express or implied, including
22  *  warranties of performance, merchantability or fitness for any particular
23  *  purpose.
24  *
25  *  Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Author: Vladimir Ivanov
30  *
31  */
32 
33 /// @file regexp_template_tester.hpp
/// Regexp template tester based on the Perl Compatible Regular Expression
/// (pcre) library. Allows testing delimited text data against a template
/// with regular expression rules.
37 
38 #include <corelib/ncbistr.hpp>
39 
40 BEGIN_NCBI_SCOPE
41 
42 
43 /** @addtogroup Regexp
44  *
45  * @{
46  */
47 
48 /////////////////////////////////////////////////////////////////////////////
49 ///
50 /// CRegexpTemplateTester --
51 ///
52 /// Match delimited text data against template with regular expression rules.
53 ///
54 /// Template file can have some types of lines:
55 ///    1) comment line, started with // symbols (can be changed);
56 ///    2) command lines, started with # symbol (can be changed);
57 ///    3) regular expressions, that are used to match corresponding line
58 ///       in the tested file/stream. Usually number of lines in the tested file
59 ///       should be the same as the number of rule lines in the template,
60 ///       unless you use 'stop' or 'skip' template commands (see below).
61 ///
/// Template commands can use variables. Each variable reference is enclosed
/// in curly braces preceded by a $ symbol, like ${var}; the exception is the
/// 'set' command, where the variable being defined is written by name only.
/// Note that these delimiters can be changed, so any other symbols can be used
/// to mark variables.
66 ///
67 /// Commands:
68 ///
69 ///    1) set var=value
70 ///          Define variable 'var', set it to 'value'.
71 ///          'value' can be string, any other variable or combination of it.
///          Variables of this type work almost like constants in C++;
///          they are useful for predefining frequently used values
///          in the regular expression rules. All variable names start with
///          a letter and can contain digits, letters, '-' or '_'.
76 ///          Example:
77 ///              # set fruit=apple
78 ///              # set color=red
79 ///              # set str=${color} ${fruit}
80 ///              // variable 'str' have value 'red apple' now
81 ///              This is a rule line with a variable ${str}
82 ///
83 ///    2) echo <string>
84 ///          Prints string/variables to stdout. Can be useful for debug purposes.
85 ///          Example:
86 ///              # set fruit=apple
87 ///              # echo some string
88 ///              # echo ${fruit}
89 ///              # echo We have an ${fruit}
90 ///
91 ///    3) test <expression>
92 ///          Compare variable with a string value.
93 ///          'value' can be string, any other variable or combination of it.
94 ///          Allowed comparing operators:
///              ==  - left value is equal to the right value
///              !=  - left value is not equal to the right value
///              =~  - left value matches the regexp on the right
///              !~  - left value does not match the regexp on the right
///          Spaces around the comparison operator count as part of
///          the compared strings on the left and right respectively.
101 ///          Example:
102 ///              # set fruit=apple
103 ///              # set color=red
104 ///              # test ${fruit}==apple
105 ///              # test ${fruit}!=orange
106 ///              # test red ${fruit}==red apple
107 ///              # test ${color} ${fruit}==red apple
108 ///              # test ${fruit}=~\w{3,}
109 ///              # set re=\w{3,}
110 ///              # test ${fruit}=~${re}
111 ///
112 ///    4) include path
///          Include the sub-template with name 'path' and process it.
///          This allows splitting big templates into parts, or factoring out
///          a part common to many templates. 'path' is a path to the
///          sub-template relative to the directory of the template that includes it.
///          Sub-templates can include other templates.
118 ///          Example:
119 ///              # include common_header.tpl
120 ///              # include tests/test1
121 ///              # include tests/test2
122 ///
123 ///   5) inline variables
124 ///          One more way to define variables based on the data in the matching
125 ///          string. For example, comparing file can have such line:
126 ///              We have 5 apples.
127 ///
128 ///          Corresponding template file may have next rules:
129 ///              We have ${count=[1-9]+} ${fruit=[a-z]+}\.
130 ///              # test ${count}==5
131 ///              # test ${fruit}==apples
132 ///
133 ///   6) skip <expression>
134 ///          Allow to skip lines in the tested file/stream.
135 ///          It is possible to skip:
136 ///            - fixed number of lines;
137 ///            - while lines match specified regular expression (find not);
138 ///            - until some line will match specified regular expression (find);
139 ///          Regular expressions can use variables here as well.
///          Note that in the while/until cases skipping stops at the first line
///          that fails ('while') or meets ('until') the specified criterion,
///          so that line can be matched again with a different rule. If you
///          don't need that line, use a 'skip 1' command on the next template
///          line, or a regular expression that matches any string, like '.*'.
145 ///          Example:
146 ///              // Skip next 5 lines in the tested file/stream
147 ///              # skip 5
148 ///              // Skip all next lines started with a capital letter
149 ///              # skip while ^[A-Z]+
150 ///              // Skip all next lines until we find a string that have
151 ///              // some digits followed with 'apple' word.
152 ///              # set fruit=apple
153 ///              # skip until \d+ ${fruit}
154 ///              We have ${count=\d+} ${fruit}
155 ///              # test ${count}==1
156 ///
157 ///   7) stop
///          Stops processing the current template. If located in the main template,
///          it forces the Compare() methods to return immediately. Can be useful for
///          debug purposes or if you want to check only some part of the file,
///          ignoring any lines below some point. Note that if the 'stop' command is
///          used in a sub-template, the testing engine stops processing that
///          sub-template only and continues processing from the next line in
///          the parent template.
165 ///          Example:
166 ///              # stop
167 ///
168 /// Nested variables.
169 ///
170 ///   You can use nested variables in any expression and every command that support it.
171 ///   They can be useful in some cases,  especially if you want to use the same symbols
172 ///   used to define variables. For example, you can write next template line:
173 ///
174 ///          We have ${count=[1-9]+} ${fruit=[a-z]+}\.
175 ///
///   It has correct syntax, but what if you want to make it a bit more complex
///   or strict? For example, the next template line is expected to generate
///   parsing errors:
///
///          // Match 2 or more digit numbers and fruits names with 4 letters at least
///          We have ${count=\d{2,}} ${fruit=\w{4,}}\.
///
///   This is because {...} is used inside the variable definition. As a solution
///   you can choose different symbols to define the start and end sequences for
///   all variables, or use nested variables as shown below:
185 ///
186 ///          # set d2=\d{2,}
187 ///          # set w4=\w{4,}
188 ///          We have ${count=${d2}} ${fruit=${w4}}\.
189 ///
190 ///   Also, you can construct variable names on the fly:
191 ///
192 ///          # set color=red
193 ///          # echo ${${color}_${apple}}
194 ///
195 ///
196 /// Note, that all methods throw CRegexpTemplateTesterException on errors,
197 /// problems with parsing template commands or data mismatch.
198 ///
199 
200 class CRegexpTemplateTester
201 {
202 public:
203     ///
204     enum EFlags {
205         fSkipEmptySourceLines   = (1 << 0),  ///< Skip empty lines in the source
206         fSkipEmptyTemplateLines = (1 << 1),  ///< Skip empty lines in the template
207         fSkipEmptyLines = fSkipEmptySourceLines | fSkipEmptyTemplateLines
208     };
209     typedef unsigned int TFlags;    ///< Binary OR of "EFlags"
210 
211     /// Default constructor.
212     CRegexpTemplateTester(TFlags flags = 0);
213 
214 
215     /// Compare file against template (file version).
216     ///
217     /// @param file_path
218     ///   Path to the checking file.
219     /// @param template_path
220     ///   Path to the corresponding template.
221     /// @return
222     ///   Nothing on success.
223     ///   Throw CRegexpTemplateTesterException on error or mismatch.
224     ///
225     void Compare(const string& file_path, const string& template_path);
226 
227     /// Compare file against template (stream version).
228     ///
229     /// @param file_stream
230     ///   Input stream with checking data.
231     /// @param template_stream
232     ///   Input stream with corresponding template data.
233     /// @return
234     ///   Nothing on success.
235     ///   Throw CRegexpTemplateTesterException on error or mismatch.
236     /// @note
237     ///   Due to file-oriented nature of the 'include' command in the
238     ///   templates, it works a little different that in the file-based
239     ///   version.  It is better do not use 'include' with streams at all,
240     ///   but if you want it, be aware.  We don't have path to the directory
241     ///   with the original template, so included sub-template should be
242     ///   located in the current directory.
243     ///
244     void Compare(istream& file_stream, istream& template_stream);
245 
246 
247     // --- Auxiliary methods used to tune up parsing for specific needs.
248 
249     /// Change strings defining start/end of variables.
250     /// By default use next syntax: ${var}
251     void SetVarScope(string& start, string& end);
252 
253     /// Change string defining start of comments line in templates.
254     /// Default value: "//"
255     void SetCommentStart(string& str);
256 
257     /// Change string defining start of template commands and operations.
258     /// Default value: "#"
259     void SetCommandStart(string& str);
260 
261     /// Change delimiters string, used for comparing data and templates.
262     /// Default value: "\r\n"
263     void SetDelimiters(string& str);
264 
265 
266     // --- Debug functions
267 
268     void   PrintVars(void) const;
269     void   PrintVar (const string& name) const;
270     string GetVar   (const string& name) const;
271 
272     typedef map<string, string> TVarMap;
273     const TVarMap& GetVars(void) const;
274 
275 private:
276     // Operations
277     void  x_Op_Set     (CTempString str);
278     void  x_Op_Echo    (CTempString str);
279     void  x_Op_Test    (CTempString str);
280     void  x_Op_Include (CTempString str, istream& file_stm);
281     void  x_Op_Skip    (CTempString str, istream& file_stm);
282 
283 private:
284     // Internal methods
285 
286     /// Reset object state
287     void x_Reset(void);
288 
289     /// Processing source
290     enum ESource {
291         eFile,     ///< source file/stream
292         eTemplate  ///< template
293     };
294 
295     /// The reason of stopping x_Compare(), if no error.
296     enum EResult {
297         eTemplateEOF,
298         eStop
299     };
300     typedef list<string> TVarList;
301 
302     /// Main compare method, compare streams.
303     /// Can be used recursively to process includes.
304     /// Return TRUE if 'stop' command found.
305     EResult x_Compare(istream& file_stream, istream& template_stream);
306 
307     /// Process/compare lines.
308     void x_CompareLines(CTempString file_line, CTempString template_line);
309 
310     /// Parse variable from string, return its length.
311     /// Can process nested variables.
312     SIZE_TYPE x_ParseVar(CTempString str, SIZE_TYPE pos) const;
313 
314     /// Parse/check variable name from string, return its length.
315     SIZE_TYPE x_ParseVarName (CTempString str, SIZE_TYPE pos) const;
316 
317     /// Replace all variables in the string with corresponding values.
318     /// Also used for preparing inline variables for consecutive regexp matching.
319     string x_SubstituteVars(CTempString str, TVarList* inline_vars) const;
320 
321     /// Get line from the stream 'is'.
322     istream& x_GetLine(istream& is, ESource src);
323 
324 private:
325     TFlags    m_Flags;           ///< Processing flags
326     string    m_VarStart;        ///< Variable definition start
327     string    m_VarEnd;          ///< Variable definition end
328     string    m_OpStart;         ///< Start of the template command line
329     string    m_CommentStart;    ///< Start of the comment line
330     string    m_EOLs;            ///< Lines delimiters
331     TVarMap   m_Vars;            ///< Map of variables/values
332 
333     // Variables for currently processing file/(sub-)template
334     string    m_FileName;        ///< Current file name (if any)
335     string    m_FileLine;        ///< Currently processing file line
336     SIZE_TYPE m_FileLineNum;     ///< Current file/stream line number
337     string    m_TemplateName;    ///< Current template name
338     string    m_TemplateLine;    ///< Currently processing template line
339     SIZE_TYPE m_TemplateLineNum; ///< Current template line number
340 
341     bool  m_ReprocessFileLine;  ///< TRUE if m_FileLine should be reprocessed
342                                 ///< with next template data line
343 };
344 
345 
346 
347 /////////////////////////////////////////////////////////////////////////////
348 ///
349 /// CRegexpTemplateTesterException --
350 ///
351 /// Define exceptions generated by CRegexpTemplateTester.
352 
353 class CRegexpTemplateTesterException : public CCoreException
354 {
355 public:
356     /// Error types that tester can generate.
357     enum EErrCode {
358         eOpenFile,         ///< file open error
359         eMismatchLength,   ///< file/template line number mismatch
360         eMismatchContent,  ///< file/template lines do not match
361         eVarNotFound,      ///< variable not found
362         eVarErr,           ///< variable definition error
363         eOpUnknown,        ///< unknown operation
364         eOpErr,            ///< operation definition error
365         eOpTest            ///< 'test' operation return FALSE
366     } ;
367     virtual const char* GetErrCodeString(void) const override;
368     NCBI_EXCEPTION_DEFAULT(CRegexpTemplateTesterException, CCoreException);
369 };
370 
371 
372 /* @} */
373 
374 END_NCBI_SCOPE
375 
376 
377 #endif /* UTIL___REGEXP_TEMPLATE_TESTER__HPP */
378