1 /*$Id: regexp_template_tester.cpp 467289 2015-05-12 16:23:41Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Vladimir Ivanov
27 *
28 * File Description:
 *   Regexp template tester based on the Perl Compatible Regular
 *   Expression (pcre) library. Allows testing delimited text data
 *   against a template containing regular expression rules.
32 *
33 */
34
35
36 #include <ncbi_pch.hpp>
37 #include <corelib/ncbifile.hpp>
38 #include <corelib/tempstr.hpp>
39 #include <util/xregexp/regexp.hpp>
40 #include <util/xregexp/regexp_template_tester.hpp>
41
42
43 BEGIN_NCBI_SCOPE
44
45
46 //////////////////////////////////////////////////////////////////////////////
47 //
// Macros for throwing exceptions on error
49 //
50
// Yields "(stream)" when 'name' is empty, otherwise 'name' itself.
// The argument is parenthesized so that compound expressions are
// evaluated as a whole.
#define FILE_NAME(name) \
    ((name).empty() ? string("(stream)") : (name))

// Throws CRegexpTemplateTesterException with a message of the form
// "<file> -- <message>".
#define ERROR_FILE(errcode, message, file) \
    NCBI_THROW(CRegexpTemplateTesterException, errcode, \
               FILE_NAME(file) + " -- " + (message))

// Throws CRegexpTemplateTesterException with a message that carries the
// current file name/line and template name/line, followed by 'message'.
// Reads m_FileName, m_FileLineNum, m_TemplateName and m_TemplateLineNum,
// so it can be used inside CRegexpTemplateTester member functions only.
#define ERROR_TEMPLATE(errcode, message) \
    NCBI_THROW(CRegexpTemplateTesterException, errcode, "\n" + \
        FILE_NAME(m_FileName) + "(" + \
        NStr::NumericToString(m_FileLineNum) + ")\n" + \
        FILE_NAME(m_TemplateName) + "(" + \
        NStr::NumericToString(m_TemplateLineNum) + ")\n-- " + \
        (message))
65
66
67
68 /////////////////////////////////////////////////////////////////////////////
69 ///
70 /// CRegexpTemplateTester --
71 ///
72
73
CRegexpTemplateTester(TFlags flags)74 CRegexpTemplateTester::CRegexpTemplateTester(TFlags flags)
75 : m_Flags(flags),
76 m_VarStart("${"),
77 m_VarEnd("}"),
78 m_OpStart("#"),
79 m_CommentStart("//"),
80 m_EOLs("\r\n")
81 {
82 x_Reset();
83 }
84
85
x_Reset()86 void CRegexpTemplateTester::x_Reset()
87 {
88 // File/template
89 m_FileName.clear();
90 m_FileLineNum = 0;
91 m_TemplateName.clear();
92 m_TemplateLineNum = 0;
93
94 // Variables
95 m_Vars.clear();
96
97 // Cached line
98 m_FileLine.clear();
99 m_ReprocessFileLine = false;
100 }
101
102
SetVarScope(string & start,string & end)103 void CRegexpTemplateTester::SetVarScope(string& start, string& end)
104 {
105 m_VarStart = start;
106 m_VarEnd = end;
107 }
108
109
SetCommentStart(string & str)110 void CRegexpTemplateTester::SetCommentStart(string& str)
111 {
112 m_CommentStart = str;
113 }
114
115
SetCommandStart(string & str)116 void CRegexpTemplateTester::SetCommandStart(string& str)
117 {
118 m_OpStart = str;
119 }
120
121
SetDelimiters(string & str)122 void CRegexpTemplateTester::SetDelimiters(string& str)
123 {
124 m_EOLs = str;
125 }
126
127
/// Get the value of a variable.
///
/// @param name
///   Name of the variable to look up.
/// @return
///   Copy of the variable's value.
/// @throw CRegexpTemplateTesterException (eVarNotFound)
///   If no variable with this name is defined.
string CRegexpTemplateTester::GetVar(const string& name) const
{
    const TVarMap::const_iterator entry = m_Vars.find(name);
    const bool is_defined = (entry != m_Vars.end());
    if ( !is_defined ) {
        ERROR_TEMPLATE(eVarNotFound, string("variable '") + name + "' is not defined");
    }
    return entry->second;
}
136
137
GetVars(void) const138 const CRegexpTemplateTester::TVarMap& CRegexpTemplateTester::GetVars(void) const
139 {
140 return m_Vars;
141 }
142
143
PrintVar(const string & name) const144 void CRegexpTemplateTester::PrintVar(const string& name) const
145 {
146 string value = GetVar(name);
147 cout << name << " = " << NStr::PrintableString(value) << endl;
148 }
149
150
PrintVars(void) const151 void CRegexpTemplateTester::PrintVars(void) const
152 {
153 ITERATE(TVarMap, it, m_Vars) {
154 cout << it->first << " = " << NStr::PrintableString(it->second) << endl;
155 }
156 }
157
158
Compare(const string & file_path,const string & template_path)159 void CRegexpTemplateTester::Compare(const string& file_path, const string& template_path)
160 {
161 x_Reset();
162 m_FileName = file_path;
163 m_TemplateName = template_path;
164
165 CNcbiIfstream file_stm(file_path.c_str());
166 if (!file_stm.good()) {
167 ERROR_FILE(eOpenFile, "cannot open file", file_path);
168 }
169 CNcbiIfstream template_stm(template_path.c_str());
170 if (!template_stm.good()) {
171 ERROR_FILE(eOpenFile, "cannot open file", template_path);
172 }
173 // Compare
174 if ( x_Compare(file_stm, template_stm) == eStop ) {
175 return;
176 }
177 // Compare number of lines in both files
178 if (x_GetLine(file_stm, eFile)) {
179 ERROR_TEMPLATE(eMismatchLength, "file/template length mismatch");
180 }
181 return;
182 }
183
184
Compare(istream & file_stm,istream & template_stm)185 void CRegexpTemplateTester::Compare(istream& file_stm, istream& template_stm)
186 {
187 x_Reset();
188 if (x_Compare(file_stm, template_stm) == eStop) {
189 return;
190 }
191 // Compare number of lines in both streams
192 if (x_GetLine(file_stm, eFile)) {
193 ERROR_TEMPLATE(eMismatchLength, "stream/template length mismatch");
194 }
195 return;
196 }
197
198
x_Compare(istream & file_stm,istream & template_stm)199 CRegexpTemplateTester::EResult CRegexpTemplateTester::x_Compare(istream& file_stm, istream& template_stm)
200 {
201 while (x_GetLine(template_stm, eTemplate))
202 {
203 CTempString str(m_TemplateLine);
204
205 // Comment line
206 if (NStr::StartsWith(str, m_CommentStart)) {
207 continue;
208 }
209 // Operations
210 if (NStr::StartsWith(str, m_OpStart)) {
211 str = NStr::TruncateSpaces_Unsafe(str.substr(m_OpStart.length()));
212
213 // stop
214 if (str == "stop") {
215 return eStop;
216 }
217 // set var=value
218 if (NStr::StartsWith(str, "set ")) {
219 x_Op_Set(str);
220 continue;
221 }
222 // echo <string>
223 if (NStr::StartsWith(str, "echo ")) {
224 x_Op_Echo(str);
225 continue;
226 }
227 // test <expression>
228 if (NStr::StartsWith(str, "test ")) {
229 x_Op_Test(str);
230 continue;
231 }
232 // include <path>
233 if (NStr::StartsWith(str, "include ")) {
234 x_Op_Include(str, file_stm);
235 continue;
236 }
237 // skip <expression>
238 if (NStr::StartsWith(str, "skip ")) {
239 x_Op_Skip(str, file_stm);
240 continue;
241 }
242 SIZE_TYPE len = str.find(' ');
243 ERROR_TEMPLATE(eOpUnknown, "unknown operation '" + string(str.substr(0, len)) + "'");
244 // unreachable
245 }
246
247 if (!m_ReprocessFileLine) {
248 // Get next line from the file
249 if (!x_GetLine(file_stm, eFile)) {
250 ERROR_TEMPLATE(eMismatchLength, "file/template length mismatch");
251 }
252 ++m_FileLineNum;
253 }
254 m_ReprocessFileLine = false;
255
256 // Compare
257 x_CompareLines(m_FileLine, str);
258 }
259 return eTemplateEOF;
260 }
261
262
x_GetLine(istream & is,ESource src)263 istream& CRegexpTemplateTester::x_GetLine(istream& is, ESource src)
264 {
265 string* str = NULL;
266 SIZE_TYPE* num = NULL;
267 bool skip_empty = false;;
268
269 switch(src) {
270 case eFile:
271 str = &m_FileLine;
272 num = &m_FileLineNum;
273 skip_empty = (m_Flags & fSkipEmptySourceLines) > 0;
274 break;
275 case eTemplate:
276 str = &m_TemplateLine;
277 num = &m_TemplateLineNum;
278 skip_empty = (m_Flags & fSkipEmptyTemplateLines) > 0;
279 break;
280 default:
281 _TROUBLE;
282 }
283
284 while (NcbiGetline(is, *str, m_EOLs)) {
285 (*num)++;
286 if (!skip_empty || !str->empty()) {
287 break;
288 }
289 }
290 return is;
291 }
292
293
x_CompareLines(CTempString file_line,CTempString template_line)294 void CRegexpTemplateTester::x_CompareLines(CTempString file_line, CTempString template_line)
295 {
296 // Check template line on variables and substitute
297
298 // Substitute known variables (constants) with values
299 string s = x_SubstituteVars(template_line, NULL);
300 // Now, replace inline variable definitions with regexp counterparts
301 TVarList inline_vars;
302 s = x_SubstituteVars(s, &inline_vars);
303
304 // Compare lines
305
306 CRegexp re(s);
307 if ( !re.IsMatch(file_line) ) {
308 ERROR_TEMPLATE(eMismatchContent, "content mismatch");
309 }
310 if ( inline_vars.size() != (SIZE_TYPE)(re.NumFound()-1) ) {
311 ERROR_TEMPLATE(eMismatchContent, "cannot match all variables");
312 }
313 // Convert sub-patterns to variables
314 int i = 1;
315 ITERATE(TVarList, it, inline_vars) {
316 m_Vars[*it] = re.GetSub(file_line, i++);
317 }
318 return;
319 }
320
321
x_ParseVar(CTempString str,SIZE_TYPE pos) const322 SIZE_TYPE CRegexpTemplateTester::x_ParseVar(CTempString str, SIZE_TYPE pos) const
323 {
324 SIZE_TYPE len = str.length();
325 pos += m_VarStart.length();
326 if (pos >= len) {
327 return NPOS;
328 }
329
330 int counter = 1;
331
332 for (; pos <= len - m_VarEnd.length(); pos++) {
333 if (NStr::CompareCase(str, pos, m_VarStart.length(), m_VarStart) == 0) {
334 counter++;
335 } else
336 if (NStr::CompareCase(str, pos, m_VarEnd.length(), m_VarEnd) == 0) {
337 counter--;
338 }
339 if (counter == 0) {
340 return pos;
341 }
342 }
343 return NPOS;
344 }
345
346
x_ParseVarName(CTempString str,SIZE_TYPE pos) const347 SIZE_TYPE CRegexpTemplateTester::x_ParseVarName(CTempString str, SIZE_TYPE pos) const
348 {
349 SIZE_TYPE len = str.length();
350 if (pos >= len) {
351 return NPOS;
352 }
353 _ASSERT(!isspace((unsigned char)str[pos]));
354 if (!isalpha((unsigned char)str[pos])) {
355 ERROR_TEMPLATE(eVarErr, "variable name should start with alphabetic symbol");
356 }
357 SIZE_TYPE start = pos;
358 ++pos;
359 while (pos < len &&
360 (isalnum((unsigned char)str[pos]) || str[pos] == '-' || str[pos] == '_') ) ++pos;
361 return pos - start;
362 }
363
364
x_SubstituteVars(CTempString str,TVarList * inline_vars) const365 string CRegexpTemplateTester::x_SubstituteVars(CTempString str, TVarList* inline_vars) const
366 {
367 SIZE_TYPE start = NStr::Find(str, m_VarStart);
368 if (start == NPOS) {
369 return str;
370 }
371 string out;
372 bool out_have_value = false; // allow to process empty variables
373 SIZE_TYPE last = 0;
374 do {
375 SIZE_TYPE end = x_ParseVar(str, start);
376 if (end == NPOS || end == start) {
377 ERROR_TEMPLATE(eVarErr, "cannot find closing tag for variable definition");
378 }
379 // Get variable definition
380 CTempString vdef = str.substr(start + m_VarStart.length(),
381 end - start - m_VarStart.length());
382
383 // If variable definition looks like like "var=value", replace it with
384 // regexp sub-pattern "(value)" and add "var" to 'inline_vars' list.
385 // Otherwise, just substitute variable with its value.
386
387 SIZE_TYPE eqpos = vdef.find('=');
388
389 if (inline_vars || (!inline_vars && eqpos == NPOS)) {
390 // Append string before variable definition
391 if (start - last) {
392 out.append(str.data() + last, start - last);
393 out_have_value = true;
394 }
395 last = end + m_VarEnd.length();
396 }
397 if (eqpos == NPOS) {
398 // Replace variable with its value
399 string s = x_SubstituteVars(vdef, NULL);
400 out.append(GetVar(s));
401 out_have_value = true;
402 } else
403
404 if (inline_vars) {
405 // Get variable name
406 SIZE_TYPE n = x_ParseVarName(vdef, 0);
407 CTempString inline_name = vdef.substr(0, n);
408 if ( !n || (inline_name.length() != eqpos)) {
409 ERROR_TEMPLATE(eVarErr, "wrong variable definition");
410 }
411 CTempString inline_val = vdef.substr(eqpos + 1);
412 if (inline_val.empty()) {
413 ERROR_TEMPLATE(eVarErr, "variable definition cannot have empty value");
414 }
415 string s = x_SubstituteVars(inline_val, NULL);
416 // Replace with subpattern
417 out.append("(" + s + ")");
418 out_have_value = true;
419 inline_vars->push_back(inline_name);
420 }
421 // Find next variable
422 start = NStr::Find(str, m_VarStart, end + m_VarEnd.length());
423 }
424 while ( start != NPOS );
425
426 if (out.empty() && !out_have_value) {
427 return str;
428 }
429 // append string tail
430 if (last < str.length()) {
431 out.append(str.data() + last, str.length() - last);
432 }
433 // return result
434 return out;
435 }
436
437
// Skip white spaces: advance index 'i' while it is less than 'len' and
// str[i] is a space character. Expects 'str', 'len' and 'i' to be defined
// at the point of use. Expands to a single statement.
#define SKIP_SPACES \
    do { \
        while (i < len  &&  isspace((unsigned char)str[i])) { ++i; } \
    } while (0)
441
442
x_Op_Set(CTempString str)443 void CRegexpTemplateTester::x_Op_Set(CTempString str)
444 {
445 SIZE_TYPE len = str.length();
446 SIZE_TYPE i = 4; // length of "set "
447 _ASSERT(i < len);
448
449 SKIP_SPACES;
450 SIZE_TYPE n = x_ParseVarName(str, i);
451 if ( !n ) {
452 ERROR_TEMPLATE(eOpErr, "SET: variable not specified");
453 }
454 string name = str.substr(i, n);
455 i += n;
456 if (str[i] != '=') {
457 ERROR_TEMPLATE(eOpErr, "SET: no assignment operator");
458 }
459 if (i >= len) {
460 ERROR_TEMPLATE(eOpErr, "SET: variable cannot have empty value");
461 }
462 // Substitute all variables with its values
463 m_Vars[name] = x_SubstituteVars(str.substr(++i), NULL);
464 }
465
466
x_Op_Echo(CTempString str)467 void CRegexpTemplateTester::x_Op_Echo(CTempString str)
468 {
469 SIZE_TYPE len = str.length();
470 SIZE_TYPE i = 5; // length of "echo "
471 SKIP_SPACES;
472 string s = x_SubstituteVars(str.substr(i), NULL);
473 cout << s << endl;
474 }
475
476
x_Op_Test(CTempString str)477 void CRegexpTemplateTester::x_Op_Test(CTempString str)
478 {
479 enum ETestOp {
480 eEqual,
481 eNotEqual,
482 eMatch,
483 eNotMatch
484 };
485 SIZE_TYPE len = str.length();
486 SIZE_TYPE i = 5; // length of "test "
487 _ASSERT(i < len);
488
489 SKIP_SPACES;
490
491 ETestOp op;
492 SIZE_TYPE pop;
493
494 if ((pop = NStr::Find(str, "==", i)) != NPOS) {
495 op = eEqual;
496 } else
497 if ((pop = NStr::Find(str, "!=", i)) != NPOS) {
498 op = eNotEqual;
499 } else
500 if ((pop = NStr::Find(str, "=~", i)) != NPOS) {
501 op = eMatch;
502 } else
503 if ((pop = NStr::Find(str, "!~", i)) != NPOS) {
504 op = eNotMatch;
505 } else {
506 ERROR_TEMPLATE(eOpErr, "TEST: no comparison operator");
507 }
508 string sop = str.substr(pop, 2);
509 string s1 = x_SubstituteVars(str.substr(i, pop - i), NULL);
510 string s2 = x_SubstituteVars(str.substr(pop+2), NULL);
511
512 bool res;
513 switch (op) {
514 case eEqual:
515 res = (s1 == s2);
516 break;
517 case eNotEqual:
518 res = (s1 != s2);
519 break;
520 case eMatch:
521 case eNotMatch:
522 {{
523 CRegexp re(s2);
524 if (re.IsMatch(s1)) {
525 res = (op == eMatch);
526 } else {
527 res = (op == eNotMatch);
528 }
529 }}
530 break;
531 }
532 if (!res) {
533 ERROR_TEMPLATE(eOpTest, "TEST: result of comparison is FALSE: '" +
534 NStr::PrintableString(s1) + "' " + sop + " '" +
535 NStr::PrintableString(s2) + "'");
536 }
537 return;
538 }
539
540
x_Op_Include(CTempString str,istream & file_stm)541 void CRegexpTemplateTester::x_Op_Include(CTempString str, istream& file_stm)
542 {
543 SIZE_TYPE len = str.length();
544 SIZE_TYPE i = 8; // length of "include "
545 _ASSERT(i < len);
546
547 SKIP_SPACES;
548
549 string path;
550 CFile::SplitPath(m_TemplateName, &path);
551 path = CFile::ConcatPath(path, str.substr(i));
552
553 CNcbiIfstream template_stm(path.c_str());
554 if (!template_stm.good()) {
555 ERROR_TEMPLATE(eOpErr, "INCLUDE: cannot open file: " + path);
556 }
557
558 // Save context for current template
559 string saved_name = m_TemplateName;
560 SIZE_TYPE saved_line = m_TemplateLineNum;
561
562 // Reset
563 m_TemplateName = path;
564 m_TemplateLineNum = 0;
565
566 // Continue comparing
567 x_Compare(file_stm, template_stm);
568
569 // Restore saved context
570 m_TemplateName = saved_name;
571 m_TemplateLineNum = saved_line;
572
573 return;
574 }
575
576
x_Op_Skip(CTempString str,istream & file_stm)577 void CRegexpTemplateTester::x_Op_Skip(CTempString str, istream& file_stm)
578 {
579 enum ESkipOp {
580 eNumber,
581 eWhile,
582 eUntil
583 };
584 SIZE_TYPE len = str.length();
585 SIZE_TYPE i = 5; // length of "skip "
586 _ASSERT(i < len);
587
588 SKIP_SPACES;
589 CTempString op_str = str.substr(i);
590 CTempString op_name;
591
592 ESkipOp op;
593 SIZE_TYPE op_num = 0;
594 CTempString re_str;
595
596 if (NStr::StartsWith(op_str, "while")) {
597 // # skip while <regexp>
598 op = eWhile;
599 op_name = op_str.substr(0, 5);
600 i += 5;
601 if (str[i] == ' ') {
602 SKIP_SPACES;
603 re_str = str.substr(i);
604 }
605 }
606 else
607 if (NStr::StartsWith(op_str, "until")) {
608 // # skip until <regexp>
609 op = eUntil;
610 op_name = op_str.substr(0, 5);
611 i += 5;
612 if (str[i] == ' ') {
613 SKIP_SPACES;
614 re_str = str.substr(i);
615 }
616 }
617 else {
618 // # skip <num>
619 op = eNumber;
620 try {
621 op_num = NStr::StringToNumeric<SIZE_TYPE>(op_str);
622 }
623 catch (CStringException&) {}
624 if (!op_num) {
625 ERROR_TEMPLATE(eOpErr, "SKIP: expected number of lines");
626 }
627 }
628
629 AutoPtr<CRegexp> re;
630 string re_pattern;
631
632 if (op == eWhile || op == eUntil) {
633 re_pattern = x_SubstituteVars(re_str, NULL);
634 if (re_str.empty()) {
635 ERROR_TEMPLATE(eOpErr, "SKIP: expected regular expression after '" +
636 (string)op_name + "'");
637 }
638 re.reset(new CRegexp(re_pattern));
639 }
640
641 // Skip lines in the input data
642
643 SIZE_TYPE num_lines = 0;
644 bool matched = false;
645
646 while (x_GetLine(file_stm, eFile))
647 {
648 ++num_lines;
649
650 if (op == eNumber) {
651 if (num_lines == op_num) {
652 break;
653 }
654 } else {
655 matched = re->IsMatch(m_FileLine);
656 if (op == eWhile) {
657 // revert
658 matched = !matched;
659 }
660 if (matched) {
661 break;
662 }
663 }
664 }
665
666 // Check that we skipped all requested number of lines or
667 // found the line matched (or not) to the regular expression.
668 if (op == eNumber) {
669 if (num_lines != op_num) {
670 ERROR_TEMPLATE(eMismatchLength, "SKIP: unable to skip " +
671 NStr::NumericToString(num_lines ) + " lines");
672 }
673 } else {
674 if (!matched) {
675 ERROR_TEMPLATE(eMismatchLength, "SKIP: unable to skip " +
676 (string)op_name + " '" +
677 NStr::PrintableString(re_pattern) + "'");
678 }
679 // For the while/until cases we should be able to process current
680 // 'stopper' file line also, it can be compared with the next template
681 // 'data' line.
682 m_ReprocessFileLine = true;
683 }
684 return;
685 }
686
687
688
689 /////////////////////////////////////////////////////////////////////////////
690 ///
691 /// CRegexpTemplateTesterException --
692 ///
693
GetErrCodeString(void) const694 const char* CRegexpTemplateTesterException::GetErrCodeString(void) const
695 {
696 switch (GetErrCode()) {
697 case eOpenFile: return "eOpenFile";
698 case eMismatchLength: return "eMismatchLength";
699 case eMismatchContent: return "eMismatchContent";
700 case eVarNotFound: return "eVarNotFound";
701 case eVarErr: return "eVarErr";
702 case eOpUnknown: return "eOpUnknown";
703 case eOpErr: return "eOpErr";
704 case eOpTest: return "eOpTest";
705 default: return CException::GetErrCodeString();
706 }
707 }
708
709
710 END_NCBI_SCOPE
711