1 /*$Id: regexp_template_tester.cpp 467289 2015-05-12 16:23:41Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Vladimir Ivanov
27  *
28  * File Description:
29  *      Regexp template tester based on the Perl Compatible Regular
 *      Expression (pcre) library. Allows testing delimited text data
 *      against a template with regular expression rules.
32  *
33  */
34 
35 
36 #include <ncbi_pch.hpp>
37 #include <corelib/ncbifile.hpp>
38 #include <corelib/tempstr.hpp>
39 #include <util/xregexp/regexp.hpp>
40 #include <util/xregexp/regexp_template_tester.hpp>
41 
42 
43 BEGIN_NCBI_SCOPE
44 
45 
46 //////////////////////////////////////////////////////////////////////////////
47 //
// Macros for throwing exceptions on error
49 //
50 
// Name to show in diagnostics: 'name' itself, or "(stream)" when 'name' is
// empty. The argument is parenthesized so that any expression can be passed.
#define FILE_NAME(name) \
    ((name).empty() ? string("(stream)") : (name))

// Throw CRegexpTemplateTesterException with error code 'errcode' and
// a message of the form "<file> -- <message>".
#define ERROR_FILE(errcode, message, file) \
    NCBI_THROW(CRegexpTemplateTesterException, errcode, \
              FILE_NAME(file) + " -- " + (message))

// Throw CRegexpTemplateTesterException with error code 'errcode' and
// a message that starts with the current file and template positions.
// Reads m_FileName, m_FileLineNum, m_TemplateName and m_TemplateLineNum,
// so it can only be used inside CRegexpTemplateTester member functions.
#define ERROR_TEMPLATE(errcode, message) \
    NCBI_THROW(CRegexpTemplateTesterException, errcode, "\n" + \
              FILE_NAME(m_FileName) + "(" + \
              NStr::NumericToString(m_FileLineNum) + ")\n" + \
              FILE_NAME(m_TemplateName) + "(" + \
              NStr::NumericToString(m_TemplateLineNum) + ")\n-- " + \
              (message))
65 
66 
67 
68 /////////////////////////////////////////////////////////////////////////////
69 ///
70 /// CRegexpTemplateTester --
71 ///
72 
73 
CRegexpTemplateTester(TFlags flags)74 CRegexpTemplateTester::CRegexpTemplateTester(TFlags flags)
75     : m_Flags(flags),
76       m_VarStart("${"),
77       m_VarEnd("}"),
78       m_OpStart("#"),
79       m_CommentStart("//"),
80       m_EOLs("\r\n")
81 {
82     x_Reset();
83 }
84 
85 
x_Reset()86 void CRegexpTemplateTester::x_Reset()
87 {
88     // File/template
89     m_FileName.clear();
90     m_FileLineNum = 0;
91     m_TemplateName.clear();
92     m_TemplateLineNum = 0;
93 
94     // Variables
95     m_Vars.clear();
96 
97     // Cached line
98     m_FileLine.clear();
99     m_ReprocessFileLine = false;
100 }
101 
102 
SetVarScope(string & start,string & end)103 void CRegexpTemplateTester::SetVarScope(string& start, string& end)
104 {
105     m_VarStart = start;
106     m_VarEnd = end;
107 }
108 
109 
SetCommentStart(string & str)110 void CRegexpTemplateTester::SetCommentStart(string& str)
111 {
112     m_CommentStart = str;
113 }
114 
115 
SetCommandStart(string & str)116 void CRegexpTemplateTester::SetCommandStart(string& str)
117 {
118     m_OpStart = str;
119 }
120 
121 
SetDelimiters(string & str)122 void CRegexpTemplateTester::SetDelimiters(string& str)
123 {
124     m_EOLs = str;
125 }
126 
127 
// Return the value of the variable 'name'.
// Throws CRegexpTemplateTesterException (eVarNotFound) if no such variable
// has been defined.
string CRegexpTemplateTester::GetVar(const string& name) const
{
    const TVarMap::const_iterator found = m_Vars.find(name);
    if (found != m_Vars.end()) {
        return found->second;
    }
    ERROR_TEMPLATE(eVarNotFound, string("variable '") + name + "' is not defined");
}
136 
137 
GetVars(void) const138 const CRegexpTemplateTester::TVarMap& CRegexpTemplateTester::GetVars(void) const
139 {
140     return m_Vars;
141 }
142 
143 
PrintVar(const string & name) const144 void CRegexpTemplateTester::PrintVar(const string& name) const
145 {
146     string value = GetVar(name);
147     cout << name << " = " << NStr::PrintableString(value) << endl;
148 }
149 
150 
PrintVars(void) const151 void CRegexpTemplateTester::PrintVars(void) const
152 {
153     ITERATE(TVarMap, it, m_Vars) {
154         cout << it->first << " = " << NStr::PrintableString(it->second) << endl;
155     }
156 }
157 
158 
Compare(const string & file_path,const string & template_path)159 void CRegexpTemplateTester::Compare(const string& file_path, const string& template_path)
160 {
161     x_Reset();
162     m_FileName = file_path;
163     m_TemplateName = template_path;
164 
165     CNcbiIfstream file_stm(file_path.c_str());
166     if (!file_stm.good()) {
167         ERROR_FILE(eOpenFile, "cannot open file", file_path);
168     }
169     CNcbiIfstream template_stm(template_path.c_str());
170     if (!template_stm.good()) {
171         ERROR_FILE(eOpenFile, "cannot open file", template_path);
172     }
173     // Compare
174     if ( x_Compare(file_stm, template_stm) == eStop ) {
175         return;
176     }
177     // Compare number of lines in both files
178     if (x_GetLine(file_stm, eFile)) {
179         ERROR_TEMPLATE(eMismatchLength, "file/template length mismatch");
180     }
181     return;
182 }
183 
184 
Compare(istream & file_stm,istream & template_stm)185 void CRegexpTemplateTester::Compare(istream& file_stm, istream& template_stm)
186 {
187     x_Reset();
188     if (x_Compare(file_stm, template_stm) == eStop) {
189         return;
190     }
191     // Compare number of lines in both streams
192     if (x_GetLine(file_stm, eFile)) {
193         ERROR_TEMPLATE(eMismatchLength, "stream/template length mismatch");
194     }
195     return;
196 }
197 
198 
x_Compare(istream & file_stm,istream & template_stm)199 CRegexpTemplateTester::EResult CRegexpTemplateTester::x_Compare(istream& file_stm, istream& template_stm)
200 {
201     while (x_GetLine(template_stm, eTemplate))
202     {
203         CTempString str(m_TemplateLine);
204 
205         // Comment line
206         if (NStr::StartsWith(str, m_CommentStart)) {
207             continue;
208         }
209         // Operations
210         if (NStr::StartsWith(str, m_OpStart)) {
211             str = NStr::TruncateSpaces_Unsafe(str.substr(m_OpStart.length()));
212 
213             // stop
214             if (str == "stop") {
215                 return eStop;
216             }
217             // set var=value
218             if (NStr::StartsWith(str, "set ")) {
219                 x_Op_Set(str);
220                 continue;
221             }
222             // echo <string>
223             if (NStr::StartsWith(str, "echo ")) {
224                 x_Op_Echo(str);
225                 continue;
226             }
227             // test <expression>
228             if (NStr::StartsWith(str, "test ")) {
229                 x_Op_Test(str);
230                 continue;
231             }
232             // include <path>
233             if (NStr::StartsWith(str, "include ")) {
234                 x_Op_Include(str, file_stm);
235                 continue;
236             }
237             // skip <expression>
238             if (NStr::StartsWith(str, "skip ")) {
239                 x_Op_Skip(str, file_stm);
240                 continue;
241             }
242             SIZE_TYPE len = str.find(' ');
243             ERROR_TEMPLATE(eOpUnknown, "unknown operation '" + string(str.substr(0, len)) + "'");
244             // unreachable
245         }
246 
247         if (!m_ReprocessFileLine) {
248             // Get next line from the file
249             if (!x_GetLine(file_stm, eFile)) {
250                 ERROR_TEMPLATE(eMismatchLength, "file/template length mismatch");
251             }
252             ++m_FileLineNum;
253         }
254         m_ReprocessFileLine = false;
255 
256         // Compare
257         x_CompareLines(m_FileLine, str);
258     }
259     return eTemplateEOF;
260 }
261 
262 
x_GetLine(istream & is,ESource src)263 istream& CRegexpTemplateTester::x_GetLine(istream& is, ESource src)
264 {
265     string*     str = NULL;
266     SIZE_TYPE*  num = NULL;
267     bool skip_empty = false;;
268 
269     switch(src) {
270         case eFile:
271             str = &m_FileLine;
272             num = &m_FileLineNum;
273             skip_empty = (m_Flags & fSkipEmptySourceLines) > 0;
274             break;
275         case eTemplate:
276             str = &m_TemplateLine;
277             num = &m_TemplateLineNum;
278             skip_empty = (m_Flags & fSkipEmptyTemplateLines) > 0;
279             break;
280         default:
281             _TROUBLE;
282     }
283 
284     while (NcbiGetline(is, *str, m_EOLs)) {
285         (*num)++;
286         if (!skip_empty  ||  !str->empty()) {
287             break;
288         }
289     }
290     return is;
291 }
292 
293 
x_CompareLines(CTempString file_line,CTempString template_line)294 void CRegexpTemplateTester::x_CompareLines(CTempString file_line, CTempString template_line)
295 {
296     // Check template line on variables and substitute
297 
298     // Substitute known variables (constants) with values
299     string s = x_SubstituteVars(template_line, NULL);
300     // Now, replace inline variable definitions with regexp counterparts
301     TVarList inline_vars;
302     s = x_SubstituteVars(s, &inline_vars);
303 
304     // Compare lines
305 
306     CRegexp re(s);
307     if ( !re.IsMatch(file_line) ) {
308         ERROR_TEMPLATE(eMismatchContent, "content mismatch");
309     }
310     if ( inline_vars.size() != (SIZE_TYPE)(re.NumFound()-1) ) {
311         ERROR_TEMPLATE(eMismatchContent, "cannot match all variables");
312     }
313     // Convert sub-patterns to variables
314     int i = 1;
315     ITERATE(TVarList, it, inline_vars) {
316         m_Vars[*it] = re.GetSub(file_line, i++);
317     }
318     return;
319 }
320 
321 
x_ParseVar(CTempString str,SIZE_TYPE pos) const322 SIZE_TYPE CRegexpTemplateTester::x_ParseVar(CTempString str, SIZE_TYPE pos) const
323 {
324     SIZE_TYPE len = str.length();
325     pos += m_VarStart.length();
326     if (pos >= len) {
327         return NPOS;
328     }
329 
330     int counter = 1;
331 
332     for (; pos <= len - m_VarEnd.length(); pos++) {
333         if (NStr::CompareCase(str, pos, m_VarStart.length(), m_VarStart) == 0) {
334             counter++;
335         } else
336         if (NStr::CompareCase(str, pos, m_VarEnd.length(), m_VarEnd) == 0) {
337             counter--;
338         }
339         if (counter == 0) {
340             return pos;
341         }
342     }
343     return NPOS;
344 }
345 
346 
x_ParseVarName(CTempString str,SIZE_TYPE pos) const347 SIZE_TYPE CRegexpTemplateTester::x_ParseVarName(CTempString str, SIZE_TYPE pos) const
348 {
349     SIZE_TYPE len = str.length();
350     if (pos >= len) {
351         return NPOS;
352     }
353     _ASSERT(!isspace((unsigned char)str[pos]));
354     if (!isalpha((unsigned char)str[pos])) {
355         ERROR_TEMPLATE(eVarErr, "variable name should start with alphabetic symbol");
356     }
357     SIZE_TYPE start = pos;
358     ++pos;
359     while (pos < len  &&
360            (isalnum((unsigned char)str[pos])  ||  str[pos] == '-'  ||  str[pos] == '_') ) ++pos;
361     return pos - start;
362 }
363 
364 
x_SubstituteVars(CTempString str,TVarList * inline_vars) const365 string CRegexpTemplateTester::x_SubstituteVars(CTempString str, TVarList* inline_vars) const
366 {
367     SIZE_TYPE start = NStr::Find(str, m_VarStart);
368     if (start == NPOS) {
369         return str;
370     }
371     string out;
372     bool   out_have_value = false;  // allow to process empty variables
373     SIZE_TYPE last = 0;
374     do {
375         SIZE_TYPE end = x_ParseVar(str, start);
376         if (end == NPOS  ||  end == start) {
377             ERROR_TEMPLATE(eVarErr, "cannot find closing tag for variable definition");
378         }
379         // Get variable definition
380         CTempString vdef = str.substr(start + m_VarStart.length(),
381                                       end - start - m_VarStart.length());
382 
383         // If variable definition looks like like "var=value", replace it with
384         // regexp sub-pattern "(value)" and add "var" to 'inline_vars' list.
385         // Otherwise, just substitute variable with its value.
386 
387         SIZE_TYPE eqpos = vdef.find('=');
388 
389         if (inline_vars || (!inline_vars  &&  eqpos == NPOS)) {
390             // Append string before variable definition
391             if (start - last) {
392                 out.append(str.data() + last, start - last);
393                 out_have_value = true;
394             }
395             last = end + m_VarEnd.length();
396         }
397         if (eqpos == NPOS) {
398             // Replace variable with its value
399             string s = x_SubstituteVars(vdef, NULL);
400             out.append(GetVar(s));
401             out_have_value = true;
402         } else
403 
404         if (inline_vars) {
405             // Get variable name
406             SIZE_TYPE n = x_ParseVarName(vdef, 0);
407             CTempString inline_name = vdef.substr(0, n);
408             if ( !n || (inline_name.length() != eqpos)) {
409                 ERROR_TEMPLATE(eVarErr, "wrong variable definition");
410             }
411             CTempString inline_val = vdef.substr(eqpos + 1);
412             if (inline_val.empty()) {
413                 ERROR_TEMPLATE(eVarErr, "variable definition cannot have empty value");
414             }
415             string s = x_SubstituteVars(inline_val, NULL);
416             // Replace with subpattern
417             out.append("(" + s + ")");
418             out_have_value = true;
419            inline_vars->push_back(inline_name);
420         }
421         // Find next variable
422         start = NStr::Find(str, m_VarStart, end + m_VarEnd.length());
423     }
424     while ( start != NPOS );
425 
426     if (out.empty() && !out_have_value) {
427         return str;
428     }
429     // append string tail
430     if (last < str.length()) {
431         out.append(str.data() + last, str.length() - last);
432     }
433     // return result
434     return out;
435 }
436 
437 
// Advance index 'i' past white space characters in 'str', but not beyond
// 'len'. Expects 'str', 'len' and 'i' to be defined at the point of use.
#define SKIP_SPACES \
        do { \
            while (i < len  &&  isspace(static_cast<unsigned char>(str[i]))) { \
                ++i; \
            } \
        } while (0)
441 
442 
x_Op_Set(CTempString str)443 void CRegexpTemplateTester::x_Op_Set(CTempString str)
444 {
445     SIZE_TYPE len = str.length();
446     SIZE_TYPE i = 4; // length of "set "
447     _ASSERT(i < len);
448 
449     SKIP_SPACES;
450     SIZE_TYPE n = x_ParseVarName(str, i);
451     if ( !n ) {
452         ERROR_TEMPLATE(eOpErr, "SET: variable not specified");
453     }
454     string name = str.substr(i, n);
455     i += n;
456     if (str[i] != '=') {
457         ERROR_TEMPLATE(eOpErr, "SET: no assignment operator");
458     }
459     if (i >= len) {
460         ERROR_TEMPLATE(eOpErr, "SET: variable cannot have empty value");
461     }
462     // Substitute all variables with its values
463     m_Vars[name] = x_SubstituteVars(str.substr(++i), NULL);
464 }
465 
466 
x_Op_Echo(CTempString str)467 void CRegexpTemplateTester::x_Op_Echo(CTempString str)
468 {
469     SIZE_TYPE len = str.length();
470     SIZE_TYPE i = 5; // length of "echo "
471     SKIP_SPACES;
472     string s = x_SubstituteVars(str.substr(i), NULL);
473     cout << s << endl;
474 }
475 
476 
x_Op_Test(CTempString str)477 void CRegexpTemplateTester::x_Op_Test(CTempString str)
478 {
479     enum ETestOp {
480         eEqual,
481         eNotEqual,
482         eMatch,
483         eNotMatch
484     };
485     SIZE_TYPE len = str.length();
486     SIZE_TYPE i = 5; // length of "test "
487     _ASSERT(i < len);
488 
489     SKIP_SPACES;
490 
491     ETestOp   op;
492     SIZE_TYPE pop;
493 
494     if ((pop = NStr::Find(str, "==", i)) != NPOS) {
495         op = eEqual;
496     } else
497     if ((pop = NStr::Find(str, "!=", i)) != NPOS) {
498         op = eNotEqual;
499     } else
500     if ((pop = NStr::Find(str, "=~", i)) != NPOS) {
501         op = eMatch;
502     } else
503     if ((pop = NStr::Find(str, "!~", i)) != NPOS) {
504         op = eNotMatch;
505     } else {
506         ERROR_TEMPLATE(eOpErr, "TEST: no comparison operator");
507     }
508     string sop = str.substr(pop, 2);
509     string s1 = x_SubstituteVars(str.substr(i, pop - i), NULL);
510     string s2 = x_SubstituteVars(str.substr(pop+2), NULL);
511 
512     bool res;
513     switch (op) {
514         case eEqual:
515             res = (s1 == s2);
516             break;
517         case eNotEqual:
518             res = (s1 != s2);
519             break;
520         case eMatch:
521         case eNotMatch:
522             {{
523                 CRegexp re(s2);
524                 if (re.IsMatch(s1)) {
525                     res = (op == eMatch);
526                 } else {
527                     res = (op == eNotMatch);
528                 }
529             }}
530             break;
531     }
532     if (!res) {
533         ERROR_TEMPLATE(eOpTest, "TEST: result of comparison is FALSE: '" +
534                                 NStr::PrintableString(s1) + "' " + sop + " '" +
535                                 NStr::PrintableString(s2) + "'");
536     }
537     return;
538 }
539 
540 
x_Op_Include(CTempString str,istream & file_stm)541 void CRegexpTemplateTester::x_Op_Include(CTempString str, istream& file_stm)
542 {
543     SIZE_TYPE len = str.length();
544     SIZE_TYPE i = 8; // length of "include "
545     _ASSERT(i < len);
546 
547     SKIP_SPACES;
548 
549     string path;
550     CFile::SplitPath(m_TemplateName, &path);
551     path = CFile::ConcatPath(path, str.substr(i));
552 
553     CNcbiIfstream template_stm(path.c_str());
554     if (!template_stm.good()) {
555         ERROR_TEMPLATE(eOpErr, "INCLUDE: cannot open file: " + path);
556     }
557 
558     // Save context for current template
559     string    saved_name = m_TemplateName;
560     SIZE_TYPE saved_line = m_TemplateLineNum;
561 
562     // Reset
563     m_TemplateName = path;
564     m_TemplateLineNum = 0;
565 
566     // Continue comparing
567     x_Compare(file_stm, template_stm);
568 
569     // Restore saved context
570     m_TemplateName = saved_name;
571     m_TemplateLineNum = saved_line;
572 
573     return;
574 }
575 
576 
x_Op_Skip(CTempString str,istream & file_stm)577 void CRegexpTemplateTester::x_Op_Skip(CTempString str, istream& file_stm)
578 {
579     enum ESkipOp {
580         eNumber,
581         eWhile,
582         eUntil
583     };
584     SIZE_TYPE len = str.length();
585     SIZE_TYPE i = 5; // length of "skip "
586     _ASSERT(i < len);
587 
588     SKIP_SPACES;
589     CTempString op_str  = str.substr(i);
590     CTempString op_name;
591 
592     ESkipOp     op;
593     SIZE_TYPE   op_num = 0;
594     CTempString re_str;
595 
596     if (NStr::StartsWith(op_str, "while")) {
597         // # skip while <regexp>
598         op = eWhile;
599         op_name = op_str.substr(0, 5);
600         i += 5;
601         if (str[i] == ' ') {
602             SKIP_SPACES;
603             re_str = str.substr(i);
604         }
605     }
606     else
607     if (NStr::StartsWith(op_str, "until")) {
608         // # skip until <regexp>
609         op = eUntil;
610         op_name = op_str.substr(0, 5);
611         i += 5;
612         if (str[i] == ' ') {
613             SKIP_SPACES;
614             re_str = str.substr(i);
615         }
616     }
617     else {
618         // # skip <num>
619         op = eNumber;
620         try {
621             op_num = NStr::StringToNumeric<SIZE_TYPE>(op_str);
622         }
623         catch (CStringException&) {}
624         if (!op_num) {
625             ERROR_TEMPLATE(eOpErr, "SKIP: expected number of lines");
626         }
627     }
628 
629     AutoPtr<CRegexp> re;
630     string re_pattern;
631 
632     if (op == eWhile || op == eUntil) {
633         re_pattern = x_SubstituteVars(re_str, NULL);
634         if (re_str.empty()) {
635             ERROR_TEMPLATE(eOpErr, "SKIP: expected regular expression after '" +
636                            (string)op_name + "'");
637         }
638         re.reset(new CRegexp(re_pattern));
639     }
640 
641     // Skip lines in the input data
642 
643     SIZE_TYPE num_lines = 0;
644     bool      matched = false;
645 
646     while (x_GetLine(file_stm, eFile))
647     {
648         ++num_lines;
649 
650         if (op == eNumber) {
651             if (num_lines == op_num) {
652                 break;
653             }
654         } else {
655             matched = re->IsMatch(m_FileLine);
656             if (op == eWhile) {
657                 // revert
658                 matched = !matched;
659             }
660             if (matched) {
661                 break;
662             }
663         }
664     }
665 
666     // Check that we skipped all requested number of lines or
667     // found the line matched (or not) to the regular expression.
668     if (op == eNumber) {
669         if (num_lines != op_num) {
670             ERROR_TEMPLATE(eMismatchLength, "SKIP: unable to skip " +
671                            NStr::NumericToString(num_lines ) + " lines");
672         }
673     } else {
674         if (!matched) {
675             ERROR_TEMPLATE(eMismatchLength, "SKIP: unable to skip " +
676                            (string)op_name + " '" +
677                            NStr::PrintableString(re_pattern) + "'");
678         }
679         // For the while/until cases we should be able to process current
680         // 'stopper' file line also, it can be compared with the next template
681         // 'data' line.
682         m_ReprocessFileLine = true;
683     }
684     return;
685 }
686 
687 
688 
689 /////////////////////////////////////////////////////////////////////////////
690 ///
691 /// CRegexpTemplateTesterException --
692 ///
693 
GetErrCodeString(void) const694 const char* CRegexpTemplateTesterException::GetErrCodeString(void) const
695 {
696     switch (GetErrCode()) {
697         case eOpenFile:         return "eOpenFile";
698         case eMismatchLength:   return "eMismatchLength";
699         case eMismatchContent:  return "eMismatchContent";
700         case eVarNotFound:      return "eVarNotFound";
701         case eVarErr:           return "eVarErr";
702         case eOpUnknown:        return "eOpUnknown";
703         case eOpErr:            return "eOpErr";
704         case eOpTest:           return "eOpTest";
705         default:                return CException::GetErrCodeString();
706     }
707 }
708 
709 
710 END_NCBI_SCOPE
711