1 /*  $Id: regexp.cpp 625904 2021-02-22 13:32:46Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Vladimir Ivanov, Clifford Clausen
27  *
28  * File Description:
29  *     C++ wrappers for Perl Compatible Regular Expression (pcre) library
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbi_limits.h>
35 #include <corelib/ncbistl.hpp>
36 #include <util/xregexp/regexp.hpp>
37 #include <pcre.h>
38 
39 #include <memory>
40 #include <stdlib.h>
41 
42 BEGIN_NCBI_SCOPE
43 
44 
45 //////////////////////////////////////////////////////////////////////////////
46 //
47 //  CRegexp
48 //
49 
// Regular expression meta characters.
// This is the set of characters that Escape() and WildcardToRegexp()
// search for. The table is never modified, so it is declared const.
static const char s_Special[] = ".?*+$^[](){}/\\|-";


// Macro to check that every bit of "mask" is set in "flags".
// Both arguments are parenthesized, so compound expressions such as
// "a | b" are evaluated as a whole before the bitwise AND is applied.
#define F_ISSET(flags, mask) (((flags) & (mask)) == (mask))
56 
57 // Auxiliary functions to convert CRegexp flags to real flags.
s_GetRealCompileFlags(CRegexp::TCompile compile_flags)58 static int s_GetRealCompileFlags(CRegexp::TCompile compile_flags)
59 {
60     int flags = 0;
61 
62     if ( !compile_flags  &&
63          !F_ISSET(compile_flags, CRegexp::fCompile_default )) {
64         NCBI_THROW(CRegexpException, eBadFlags,
65                    "Bad regular expression compilation flags");
66     }
67     if ( F_ISSET(compile_flags, CRegexp::fCompile_ignore_case) ) {
68         flags |= PCRE_CASELESS;
69     }
70     if ( F_ISSET(compile_flags, CRegexp::fCompile_dotall) ) {
71         flags |= PCRE_DOTALL;
72     }
73     if ( F_ISSET(compile_flags, CRegexp::fCompile_newline) ) {
74         flags |= PCRE_MULTILINE;
75     }
76     if ( F_ISSET(compile_flags, CRegexp::fCompile_ungreedy) ) {
77         flags |= PCRE_UNGREEDY;
78     }
79     if ( F_ISSET(compile_flags, CRegexp::fCompile_extended) ) {
80         flags |= PCRE_EXTENDED;
81     }
82     return flags;
83 }
84 
s_GetRealMatchFlags(CRegexp::TMatch match_flags)85 static int s_GetRealMatchFlags(CRegexp::TMatch match_flags)
86 {
87     int flags = 0;
88 
89     if ( !match_flags  &&
90          !F_ISSET(match_flags, CRegexp::fMatch_default) ) {
91         NCBI_THROW(CRegexpException, eBadFlags,
92                    "Bad regular expression match flags");
93     }
94     if ( F_ISSET(match_flags, CRegexp::fMatch_not_begin) ) {
95         flags |= PCRE_NOTBOL;
96     }
97     if ( F_ISSET(match_flags, CRegexp::fMatch_not_end) ) {
98         flags |= PCRE_NOTEOL;
99     }
100     return flags;
101 }
102 
103 
CRegexp(CTempStringEx pattern,TCompile flags)104 CRegexp::CRegexp(CTempStringEx pattern, TCompile flags)
105     : m_PReg(NULL), m_Extra(NULL), m_NumFound(0)
106 {
107     Set(pattern, flags);
108 }
109 
110 
~CRegexp()111 CRegexp::~CRegexp()
112 {
113     (*pcre_free)(m_PReg);
114     (*pcre_free)(m_Extra);
115 }
116 
117 
Set(CTempStringEx pattern,TCompile flags)118 void CRegexp::Set(CTempStringEx pattern, TCompile flags)
119 {
120     if ( m_PReg ) {
121         (*pcre_free)(m_PReg);
122     }
123     const char *err;
124     int err_offset;
125     int x_flags = s_GetRealCompileFlags(flags);
126 
127     if ( pattern.HasZeroAtEnd() ) {
128         m_PReg = pcre_compile(pattern.data(), x_flags, &err, &err_offset, NULL);
129     } else {
130         m_PReg = pcre_compile(string(pattern).c_str(), x_flags, &err, &err_offset, NULL);
131     }
132     if ( !m_PReg ) {
133         NCBI_THROW(CRegexpException, eCompile, "Compilation of the pattern '" +
134                    string(pattern) + "' failed: " + err);
135     }
136     if ( m_Extra ) {
137         (*pcre_free)(m_Extra);
138     }
139     m_Extra = pcre_study((pcre*)m_PReg, 0, &err);
140 }
141 
142 
143 // @deprecated
GetSub(CTempString str,size_t idx,string & dst) const144 void CRegexp::GetSub(CTempString str, size_t idx, string& dst) const
145 {
146     if ( (int)idx >= m_NumFound ) {
147         dst.erase();
148         return;
149     }
150     int start = m_Results[2 * idx];
151     int end   = m_Results[2 * idx + 1];
152     if (start == -1  ||  end == -1) {
153         dst.erase();
154     } else {
155         dst.assign(str.data() + start, end - start);
156     }
157 }
158 
159 
GetSub(CTempString str,size_t idx) const160 CTempString CRegexp::GetSub(CTempString str, size_t idx) const
161 {
162     if ( (int)idx >= m_NumFound ) {
163         return CTempString();
164     }
165     int start = m_Results[2 * idx];
166     int end   = m_Results[2 * idx + 1];
167     if (start == -1  ||  end == -1) {
168         return CTempString();
169     }
170     return CTempString(str.data() + start, end - start);
171 }
172 
173 
GetMatch(CTempString str,size_t offset,size_t idx,TMatch flags,bool noreturn)174 CTempString CRegexp::GetMatch(CTempString str, size_t offset, size_t idx,
175                               TMatch flags, bool noreturn)
176 {
177     int x_flags = s_GetRealMatchFlags(flags);
178     m_NumFound = pcre_exec((pcre*)m_PReg, (pcre_extra*)m_Extra, str.data(),
179                            (int)str.length(), (int)offset,
180                            x_flags, m_Results,
181                            (int)(kRegexpMaxSubPatterns +1) * 3);
182     if ( noreturn ) {
183         return CTempString();
184     }
185     return GetSub(str, idx);
186 }
187 
188 
IsMatch(CTempString str,TMatch flags)189 bool CRegexp::IsMatch(CTempString str, TMatch flags)
190 {
191     int x_flags = s_GetRealMatchFlags(flags);
192     m_NumFound = pcre_exec((pcre*)m_PReg, (pcre_extra*)m_Extra, str.data(),
193                            (int)str.length(), 0, x_flags, m_Results,
194                            (int)(kRegexpMaxSubPatterns +1) * 3);
195     return m_NumFound > 0;
196 }
197 
198 
Escape(CTempString str)199 string CRegexp::Escape(CTempString str)
200 {
201     // Find first special character
202     SIZE_TYPE prev = 0;
203     SIZE_TYPE pos = str.find_first_of(s_Special, prev);
204     if ( pos == NPOS ) {
205         // All characters are good - return original string
206         return str;
207     }
208     CNcbiOstrstream out;
209     do {
210         // Write first good characters in one chunk
211         out.write(str.data() + prev, pos - prev);
212         // Escape char
213         out.put('\\');
214         out.put(str[pos]);
215         // Find next
216         prev = pos + 1;
217         pos = str.find_first_of(s_Special, prev);
218     } while (pos != NPOS);
219 
220     // Write remaining part of the string
221     out.write(str.data() + prev, str.length() - prev);
222     // Return encoded string
223     return CNcbiOstrstreamToString(out);
224 }
225 
226 
WildcardToRegexp(CTempString mask)227 string CRegexp::WildcardToRegexp(CTempString mask)
228 {
229     // Find first special character
230     SIZE_TYPE prev = 0;
231     SIZE_TYPE pos = mask.find_first_of(s_Special, prev);
232     if ( pos == NPOS ) {
233         // All characters are good - return original string
234         return mask;
235     }
236     CNcbiOstrstream out;
237     do {
238         // Write first good characters in one chunk
239         out.write(mask.data() + prev, pos - prev);
240         // Convert or escape found character
241         if (mask[pos] == '*') {
242             out.put('.');
243             out.put(mask[pos]);
244         } else if (mask[pos] == '?') {
245             out.put('.');
246         } else {
247             // Escape character
248             out.put('\\');
249             out.put(mask[pos]);
250         }
251         // Find next
252         prev = pos + 1;
253         pos = mask.find_first_of(s_Special, prev);
254     } while (pos != NPOS);
255 
256     // Write remaining part of the string
257     out.write(mask.data() + prev, mask.length() - prev);
258     // Return encoded string
259     return CNcbiOstrstreamToString(out);
260 }
261 
262 
263 //////////////////////////////////////////////////////////////////////////////
264 //
265 //  CRegexpUtil
266 //
267 
CRegexpUtil(CTempString str)268 CRegexpUtil::CRegexpUtil(CTempString str)
269     : m_Delimiter("\n")
270 {
271     Reset(str);
272     return;
273 }
274 
275 
SetRange(CTempStringEx addr_start,CTempStringEx addr_end,CTempString delimiter)276 void CRegexpUtil::SetRange(
277     CTempStringEx addr_start,
278     CTempStringEx addr_end,
279     CTempString   delimiter)
280 {
281     m_RangeStart = addr_start;
282     m_RangeEnd   = addr_end;
283     m_Delimiter  = delimiter;
284     x_Divide(delimiter);
285 }
286 
287 
Replace(CTempStringEx search,CTempString replace,CRegexp::TCompile compile_flags,CRegexp::TMatch match_flags,size_t max_replace)288 size_t CRegexpUtil::Replace(
289     CTempStringEx     search,
290     CTempString       replace,
291     CRegexp::TCompile compile_flags,
292     CRegexp::TMatch   match_flags,
293     size_t            max_replace)
294 {
295     if ( search.empty() ) {
296         return 0;
297     }
298     size_t n_replace = 0;
299 
300     // Join string to parts with delimiter
301     x_Join();
302 
303     // Compile regular expression.
304     CRegexp re(search, compile_flags);
305     size_t  start_pos = 0;
306 
307     for (size_t count = 0; !(max_replace && count >= max_replace); count++) {
308 
309         // Match pattern.
310         re.GetMatch(m_Content, (int)start_pos, 0, match_flags, true);
311         int num_found = re.NumFound();
312         if (num_found <= 0) {
313             break;
314         }
315 
316         // Substitute all subpatterns "$<digit>" to values in the "replace" string
317         const int* result;
318         string     x_replace(replace.data(), replace.length());
319         size_t     pos = 0;
320 
321         for (;;) {
322             // Find "$"
323             pos = x_replace.find("$", pos);
324             if (pos == NPOS) {
325                 break;
326             }
327             // Try to convert string after the "$" to number
328             errno = 0;
329             const char* startptr = x_replace.c_str() + pos + 1;
330             char* endptr = 0;
331             long value = strtol(startptr, &endptr, 10);
332 
333             if ( errno  ||  endptr == startptr  ||  !endptr  ||
334                  value < kMin_Int  ||  value > kMax_Int) {
335                 // Format error, skip single "$".
336                 pos++;
337                 continue;
338 
339             }
340             int n = (int)value;
341 
342             // Get subpattern value
343             CTempString subpattern;
344             if ( n > 0  &&  n < num_found ) {
345                 result = re.GetResults(n);
346                 if (result[0] >= 0  &&  result[1] >= 0) {
347                     subpattern.assign(m_Content.data() + result[0], result[1] - result[0]);
348                 }
349             }
350 
351             // Check braces {$...}
352             size_t sp_start = pos;
353             size_t sp_end   = endptr - x_replace.c_str();
354             if ( sp_start > 0  &&  x_replace[sp_start-1] == '{') {
355                 sp_start--;
356                 if ( sp_end <  x_replace.length()  &&
357                      x_replace[sp_end] == '}') {
358                     sp_end++;
359                 } else {
360                     // Format error -- missed closed brace.
361                     sp_start++;
362                 }
363             }
364             // Replace $n with subpattern value.
365             x_replace.replace(sp_start, sp_end - sp_start, subpattern.data(), subpattern.length());
366             pos += subpattern.length();
367         }
368 
369         // Replace pattern with "x_replace".
370         result = re.GetResults(0);
371         m_Content.replace(result[0], result[1] - result[0], x_replace);
372         n_replace++;
373         start_pos = result[0] + x_replace.length();
374         // Guard against endless loop when regular expression
375         // can match the empty string.
376         if ( !x_replace.length() &&  result[0] == result[1] )
377             start_pos++;
378     }
379     return n_replace;
380 }
381 
382 
ReplaceRange(CTempStringEx search,CTempString replace,CRegexp::TCompile compile_flags,CRegexp::TMatch match_flags,CRegexpUtil::ERange process_inside,size_t max_replace)383 size_t CRegexpUtil::ReplaceRange(
384     CTempStringEx       search,
385     CTempString         replace,
386     CRegexp::TCompile   compile_flags,
387     CRegexp::TMatch     match_flags,
388     CRegexpUtil::ERange process_inside,
389     size_t              max_replace
390     )
391 {
392     if ( search.empty() ) {
393         return 0;
394     }
395 
396     // Number of replaced strings
397     size_t n_replace = 0;
398 
399     // Split source string to parts by delimiter
400     x_Divide();
401 
402     // Flag which denote that current line is inside "range"
403     bool inside = m_RangeStart.empty();
404     bool close_inside = false;
405 
406     NON_CONST_ITERATE (list<string>, i, m_ContentList) {
407         // Get new line
408         CTempString line(*i);
409 
410         // Check for beginning of block [addr_re_start:addr_re_end]
411         if ( !inside  &&  !m_RangeStart.empty() ) {
412             CRegexp re(m_RangeStart);
413             re.GetMatch(line, 0, 0, CRegexp::fMatch_default, true);
414             inside = (re.NumFound() > 0);
415         } else {
416             inside = true;
417         }
418 
419         // Two addresses were specified?
420         // Check for ending of block [addr_re_start:addr_re_end]
421         // before doing any replacements in the string
422         if ( inside  &&  !m_RangeEnd.empty() ) {
423             CRegexp re(m_RangeEnd);
424             re.GetMatch(line, 0, 0, CRegexp::fMatch_default, true);
425             close_inside = (re.NumFound() > 0);
426         } else {
427             // One address -- process one current string only
428             close_inside = true;
429         }
430 
431         // Process current line
432         if ( (inside   &&  process_inside == eInside)  ||
433              (!inside  &&  process_inside == eOutside) ) {
434             CRegexpUtil re(line);
435             n_replace += re.Replace(search, replace,
436                                     compile_flags, match_flags, max_replace);
437             *i = re; // invalidates CTempString line
438         }
439 
440         // Finish processing block?
441         if ( close_inside ) {
442             inside = false;
443         }
444     }
445 
446     return n_replace;
447 }
448 
x_Divide(CTempString delimiter)449 void CRegexpUtil::x_Divide(CTempString delimiter)
450 {
451     /// Join substrings back to entire string if divided
452     if ( m_IsDivided  ) {
453         if ( delimiter == m_Delimiter ) {
454             return;
455         }
456         x_Join();
457     }
458     m_ContentList.clear();
459 
460     // Split source string to parts by new delimiter
461     size_t pos;
462     size_t start_pos = 0;
463     for (;;) {
464         pos = m_Content.find(delimiter.data(), start_pos, delimiter.length());
465         if (pos == NPOS) {
466             m_ContentList.push_back(m_Content.substr(start_pos));
467             break;
468         } else {
469             m_ContentList.push_back(m_Content.substr(start_pos, pos - start_pos));
470             start_pos = pos + delimiter.length();
471         }
472     }
473     m_IsDivided = true;
474     // Save delimiter for consecutive joining
475     m_Delimiter = delimiter;
476 }
477 
478 
x_Join(void)479 void CRegexpUtil::x_Join(void)
480 {
481     if ( m_IsDivided ) {
482         m_Content = NStr::Join(m_ContentList, m_Delimiter);
483         m_IsDivided = false;
484     }
485 }
486 
GetErrCodeString(void) const487 const char* CRegexpException::GetErrCodeString(void) const
488 {
489     switch ( GetErrCode() ) {
490     case eCompile:    return "eCompile";
491     case eBadFlags:   return "eBadFlags";
492     default:          return CException::GetErrCodeString();
493     }
494 }
495 
496 
497 END_NCBI_SCOPE
498