1 /* $Id: regexp.cpp 625904 2021-02-22 13:32:46Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Vladimir Ivanov, Clifford Clausen
27 *
28 * File Description:
29 * C++ wrappers for Perl Compatible Regular Expression (pcre) library
30 *
31 */
32
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbi_limits.h>
35 #include <corelib/ncbistl.hpp>
36 #include <util/xregexp/regexp.hpp>
37 #include <pcre.h>
38
39 #include <memory>
40 #include <stdlib.h>
41
42 BEGIN_NCBI_SCOPE
43
44
45 //////////////////////////////////////////////////////////////////////////////
46 //
47 // CRegexp
48 //
49
// Regular expression meta characters.
// Escape() and WildcardToRegexp() search their input for any of these.
// The table is only ever read, so it is declared const.
static const char s_Special[] = ".?*+$^[](){}/\\|-";
52
53
// Macro to check bits: true when every bit of "mask" is also set in "flags".
// Both arguments are parenthesized so that a compound argument such as
// "a | b" is evaluated as a whole before the bit test is applied.
#define F_ISSET(flags, mask) (((flags) & (mask)) == (mask))
56
57 // Auxiliary functions to convert CRegexp flags to real flags.
s_GetRealCompileFlags(CRegexp::TCompile compile_flags)58 static int s_GetRealCompileFlags(CRegexp::TCompile compile_flags)
59 {
60 int flags = 0;
61
62 if ( !compile_flags &&
63 !F_ISSET(compile_flags, CRegexp::fCompile_default )) {
64 NCBI_THROW(CRegexpException, eBadFlags,
65 "Bad regular expression compilation flags");
66 }
67 if ( F_ISSET(compile_flags, CRegexp::fCompile_ignore_case) ) {
68 flags |= PCRE_CASELESS;
69 }
70 if ( F_ISSET(compile_flags, CRegexp::fCompile_dotall) ) {
71 flags |= PCRE_DOTALL;
72 }
73 if ( F_ISSET(compile_flags, CRegexp::fCompile_newline) ) {
74 flags |= PCRE_MULTILINE;
75 }
76 if ( F_ISSET(compile_flags, CRegexp::fCompile_ungreedy) ) {
77 flags |= PCRE_UNGREEDY;
78 }
79 if ( F_ISSET(compile_flags, CRegexp::fCompile_extended) ) {
80 flags |= PCRE_EXTENDED;
81 }
82 return flags;
83 }
84
s_GetRealMatchFlags(CRegexp::TMatch match_flags)85 static int s_GetRealMatchFlags(CRegexp::TMatch match_flags)
86 {
87 int flags = 0;
88
89 if ( !match_flags &&
90 !F_ISSET(match_flags, CRegexp::fMatch_default) ) {
91 NCBI_THROW(CRegexpException, eBadFlags,
92 "Bad regular expression match flags");
93 }
94 if ( F_ISSET(match_flags, CRegexp::fMatch_not_begin) ) {
95 flags |= PCRE_NOTBOL;
96 }
97 if ( F_ISSET(match_flags, CRegexp::fMatch_not_end) ) {
98 flags |= PCRE_NOTEOL;
99 }
100 return flags;
101 }
102
103
CRegexp(CTempStringEx pattern,TCompile flags)104 CRegexp::CRegexp(CTempStringEx pattern, TCompile flags)
105 : m_PReg(NULL), m_Extra(NULL), m_NumFound(0)
106 {
107 Set(pattern, flags);
108 }
109
110
~CRegexp()111 CRegexp::~CRegexp()
112 {
113 (*pcre_free)(m_PReg);
114 (*pcre_free)(m_Extra);
115 }
116
117
Set(CTempStringEx pattern,TCompile flags)118 void CRegexp::Set(CTempStringEx pattern, TCompile flags)
119 {
120 if ( m_PReg ) {
121 (*pcre_free)(m_PReg);
122 }
123 const char *err;
124 int err_offset;
125 int x_flags = s_GetRealCompileFlags(flags);
126
127 if ( pattern.HasZeroAtEnd() ) {
128 m_PReg = pcre_compile(pattern.data(), x_flags, &err, &err_offset, NULL);
129 } else {
130 m_PReg = pcre_compile(string(pattern).c_str(), x_flags, &err, &err_offset, NULL);
131 }
132 if ( !m_PReg ) {
133 NCBI_THROW(CRegexpException, eCompile, "Compilation of the pattern '" +
134 string(pattern) + "' failed: " + err);
135 }
136 if ( m_Extra ) {
137 (*pcre_free)(m_Extra);
138 }
139 m_Extra = pcre_study((pcre*)m_PReg, 0, &err);
140 }
141
142
143 // @deprecated
GetSub(CTempString str,size_t idx,string & dst) const144 void CRegexp::GetSub(CTempString str, size_t idx, string& dst) const
145 {
146 if ( (int)idx >= m_NumFound ) {
147 dst.erase();
148 return;
149 }
150 int start = m_Results[2 * idx];
151 int end = m_Results[2 * idx + 1];
152 if (start == -1 || end == -1) {
153 dst.erase();
154 } else {
155 dst.assign(str.data() + start, end - start);
156 }
157 }
158
159
GetSub(CTempString str,size_t idx) const160 CTempString CRegexp::GetSub(CTempString str, size_t idx) const
161 {
162 if ( (int)idx >= m_NumFound ) {
163 return CTempString();
164 }
165 int start = m_Results[2 * idx];
166 int end = m_Results[2 * idx + 1];
167 if (start == -1 || end == -1) {
168 return CTempString();
169 }
170 return CTempString(str.data() + start, end - start);
171 }
172
173
GetMatch(CTempString str,size_t offset,size_t idx,TMatch flags,bool noreturn)174 CTempString CRegexp::GetMatch(CTempString str, size_t offset, size_t idx,
175 TMatch flags, bool noreturn)
176 {
177 int x_flags = s_GetRealMatchFlags(flags);
178 m_NumFound = pcre_exec((pcre*)m_PReg, (pcre_extra*)m_Extra, str.data(),
179 (int)str.length(), (int)offset,
180 x_flags, m_Results,
181 (int)(kRegexpMaxSubPatterns +1) * 3);
182 if ( noreturn ) {
183 return CTempString();
184 }
185 return GetSub(str, idx);
186 }
187
188
IsMatch(CTempString str,TMatch flags)189 bool CRegexp::IsMatch(CTempString str, TMatch flags)
190 {
191 int x_flags = s_GetRealMatchFlags(flags);
192 m_NumFound = pcre_exec((pcre*)m_PReg, (pcre_extra*)m_Extra, str.data(),
193 (int)str.length(), 0, x_flags, m_Results,
194 (int)(kRegexpMaxSubPatterns +1) * 3);
195 return m_NumFound > 0;
196 }
197
198
Escape(CTempString str)199 string CRegexp::Escape(CTempString str)
200 {
201 // Find first special character
202 SIZE_TYPE prev = 0;
203 SIZE_TYPE pos = str.find_first_of(s_Special, prev);
204 if ( pos == NPOS ) {
205 // All characters are good - return original string
206 return str;
207 }
208 CNcbiOstrstream out;
209 do {
210 // Write first good characters in one chunk
211 out.write(str.data() + prev, pos - prev);
212 // Escape char
213 out.put('\\');
214 out.put(str[pos]);
215 // Find next
216 prev = pos + 1;
217 pos = str.find_first_of(s_Special, prev);
218 } while (pos != NPOS);
219
220 // Write remaining part of the string
221 out.write(str.data() + prev, str.length() - prev);
222 // Return encoded string
223 return CNcbiOstrstreamToString(out);
224 }
225
226
WildcardToRegexp(CTempString mask)227 string CRegexp::WildcardToRegexp(CTempString mask)
228 {
229 // Find first special character
230 SIZE_TYPE prev = 0;
231 SIZE_TYPE pos = mask.find_first_of(s_Special, prev);
232 if ( pos == NPOS ) {
233 // All characters are good - return original string
234 return mask;
235 }
236 CNcbiOstrstream out;
237 do {
238 // Write first good characters in one chunk
239 out.write(mask.data() + prev, pos - prev);
240 // Convert or escape found character
241 if (mask[pos] == '*') {
242 out.put('.');
243 out.put(mask[pos]);
244 } else if (mask[pos] == '?') {
245 out.put('.');
246 } else {
247 // Escape character
248 out.put('\\');
249 out.put(mask[pos]);
250 }
251 // Find next
252 prev = pos + 1;
253 pos = mask.find_first_of(s_Special, prev);
254 } while (pos != NPOS);
255
256 // Write remaining part of the string
257 out.write(mask.data() + prev, mask.length() - prev);
258 // Return encoded string
259 return CNcbiOstrstreamToString(out);
260 }
261
262
263 //////////////////////////////////////////////////////////////////////////////
264 //
265 // CRegexpUtil
266 //
267
CRegexpUtil(CTempString str)268 CRegexpUtil::CRegexpUtil(CTempString str)
269 : m_Delimiter("\n")
270 {
271 Reset(str);
272 return;
273 }
274
275
SetRange(CTempStringEx addr_start,CTempStringEx addr_end,CTempString delimiter)276 void CRegexpUtil::SetRange(
277 CTempStringEx addr_start,
278 CTempStringEx addr_end,
279 CTempString delimiter)
280 {
281 m_RangeStart = addr_start;
282 m_RangeEnd = addr_end;
283 m_Delimiter = delimiter;
284 x_Divide(delimiter);
285 }
286
287
Replace(CTempStringEx search,CTempString replace,CRegexp::TCompile compile_flags,CRegexp::TMatch match_flags,size_t max_replace)288 size_t CRegexpUtil::Replace(
289 CTempStringEx search,
290 CTempString replace,
291 CRegexp::TCompile compile_flags,
292 CRegexp::TMatch match_flags,
293 size_t max_replace)
294 {
295 if ( search.empty() ) {
296 return 0;
297 }
298 size_t n_replace = 0;
299
300 // Join string to parts with delimiter
301 x_Join();
302
303 // Compile regular expression.
304 CRegexp re(search, compile_flags);
305 size_t start_pos = 0;
306
307 for (size_t count = 0; !(max_replace && count >= max_replace); count++) {
308
309 // Match pattern.
310 re.GetMatch(m_Content, (int)start_pos, 0, match_flags, true);
311 int num_found = re.NumFound();
312 if (num_found <= 0) {
313 break;
314 }
315
316 // Substitute all subpatterns "$<digit>" to values in the "replace" string
317 const int* result;
318 string x_replace(replace.data(), replace.length());
319 size_t pos = 0;
320
321 for (;;) {
322 // Find "$"
323 pos = x_replace.find("$", pos);
324 if (pos == NPOS) {
325 break;
326 }
327 // Try to convert string after the "$" to number
328 errno = 0;
329 const char* startptr = x_replace.c_str() + pos + 1;
330 char* endptr = 0;
331 long value = strtol(startptr, &endptr, 10);
332
333 if ( errno || endptr == startptr || !endptr ||
334 value < kMin_Int || value > kMax_Int) {
335 // Format error, skip single "$".
336 pos++;
337 continue;
338
339 }
340 int n = (int)value;
341
342 // Get subpattern value
343 CTempString subpattern;
344 if ( n > 0 && n < num_found ) {
345 result = re.GetResults(n);
346 if (result[0] >= 0 && result[1] >= 0) {
347 subpattern.assign(m_Content.data() + result[0], result[1] - result[0]);
348 }
349 }
350
351 // Check braces {$...}
352 size_t sp_start = pos;
353 size_t sp_end = endptr - x_replace.c_str();
354 if ( sp_start > 0 && x_replace[sp_start-1] == '{') {
355 sp_start--;
356 if ( sp_end < x_replace.length() &&
357 x_replace[sp_end] == '}') {
358 sp_end++;
359 } else {
360 // Format error -- missed closed brace.
361 sp_start++;
362 }
363 }
364 // Replace $n with subpattern value.
365 x_replace.replace(sp_start, sp_end - sp_start, subpattern.data(), subpattern.length());
366 pos += subpattern.length();
367 }
368
369 // Replace pattern with "x_replace".
370 result = re.GetResults(0);
371 m_Content.replace(result[0], result[1] - result[0], x_replace);
372 n_replace++;
373 start_pos = result[0] + x_replace.length();
374 // Guard against endless loop when regular expression
375 // can match the empty string.
376 if ( !x_replace.length() && result[0] == result[1] )
377 start_pos++;
378 }
379 return n_replace;
380 }
381
382
ReplaceRange(CTempStringEx search,CTempString replace,CRegexp::TCompile compile_flags,CRegexp::TMatch match_flags,CRegexpUtil::ERange process_inside,size_t max_replace)383 size_t CRegexpUtil::ReplaceRange(
384 CTempStringEx search,
385 CTempString replace,
386 CRegexp::TCompile compile_flags,
387 CRegexp::TMatch match_flags,
388 CRegexpUtil::ERange process_inside,
389 size_t max_replace
390 )
391 {
392 if ( search.empty() ) {
393 return 0;
394 }
395
396 // Number of replaced strings
397 size_t n_replace = 0;
398
399 // Split source string to parts by delimiter
400 x_Divide();
401
402 // Flag which denote that current line is inside "range"
403 bool inside = m_RangeStart.empty();
404 bool close_inside = false;
405
406 NON_CONST_ITERATE (list<string>, i, m_ContentList) {
407 // Get new line
408 CTempString line(*i);
409
410 // Check for beginning of block [addr_re_start:addr_re_end]
411 if ( !inside && !m_RangeStart.empty() ) {
412 CRegexp re(m_RangeStart);
413 re.GetMatch(line, 0, 0, CRegexp::fMatch_default, true);
414 inside = (re.NumFound() > 0);
415 } else {
416 inside = true;
417 }
418
419 // Two addresses were specified?
420 // Check for ending of block [addr_re_start:addr_re_end]
421 // before doing any replacements in the string
422 if ( inside && !m_RangeEnd.empty() ) {
423 CRegexp re(m_RangeEnd);
424 re.GetMatch(line, 0, 0, CRegexp::fMatch_default, true);
425 close_inside = (re.NumFound() > 0);
426 } else {
427 // One address -- process one current string only
428 close_inside = true;
429 }
430
431 // Process current line
432 if ( (inside && process_inside == eInside) ||
433 (!inside && process_inside == eOutside) ) {
434 CRegexpUtil re(line);
435 n_replace += re.Replace(search, replace,
436 compile_flags, match_flags, max_replace);
437 *i = re; // invalidates CTempString line
438 }
439
440 // Finish processing block?
441 if ( close_inside ) {
442 inside = false;
443 }
444 }
445
446 return n_replace;
447 }
448
x_Divide(CTempString delimiter)449 void CRegexpUtil::x_Divide(CTempString delimiter)
450 {
451 /// Join substrings back to entire string if divided
452 if ( m_IsDivided ) {
453 if ( delimiter == m_Delimiter ) {
454 return;
455 }
456 x_Join();
457 }
458 m_ContentList.clear();
459
460 // Split source string to parts by new delimiter
461 size_t pos;
462 size_t start_pos = 0;
463 for (;;) {
464 pos = m_Content.find(delimiter.data(), start_pos, delimiter.length());
465 if (pos == NPOS) {
466 m_ContentList.push_back(m_Content.substr(start_pos));
467 break;
468 } else {
469 m_ContentList.push_back(m_Content.substr(start_pos, pos - start_pos));
470 start_pos = pos + delimiter.length();
471 }
472 }
473 m_IsDivided = true;
474 // Save delimiter for consecutive joining
475 m_Delimiter = delimiter;
476 }
477
478
x_Join(void)479 void CRegexpUtil::x_Join(void)
480 {
481 if ( m_IsDivided ) {
482 m_Content = NStr::Join(m_ContentList, m_Delimiter);
483 m_IsDivided = false;
484 }
485 }
486
GetErrCodeString(void) const487 const char* CRegexpException::GetErrCodeString(void) const
488 {
489 switch ( GetErrCode() ) {
490 case eCompile: return "eCompile";
491 case eBadFlags: return "eBadFlags";
492 default: return CException::GetErrCodeString();
493 }
494 }
495
496
497 END_NCBI_SCOPE
498