1 //////////////////////////////////////////////////////////////////////// 2 // 3 // Copyright (C) 2002-2021 The Octave Project Developers 4 // 5 // See the file COPYRIGHT.md in the top-level directory of this 6 // distribution or <https://octave.org/copyright/>. 7 // 8 // This file is part of Octave. 9 // 10 // Octave is free software: you can redistribute it and/or modify it 11 // under the terms of the GNU General Public License as published by 12 // the Free Software Foundation, either version 3 of the License, or 13 // (at your option) any later version. 14 // 15 // Octave is distributed in the hope that it will be useful, but 16 // WITHOUT ANY WARRANTY; without even the implied warranty of 17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 // GNU General Public License for more details. 19 // 20 // You should have received a copy of the GNU General Public License 21 // along with Octave; see the file COPYING. If not, see 22 // <https://www.gnu.org/licenses/>. 23 // 24 //////////////////////////////////////////////////////////////////////// 25 26 #if defined (HAVE_CONFIG_H) 27 # include "config.h" 28 #endif 29 30 #include <list> 31 #include <sstream> 32 #include <string> 33 #include <vector> 34 35 #if defined (HAVE_PCRE_H) 36 # include <pcre.h> 37 #elif defined (HAVE_PCRE_PCRE_H) 38 # include <pcre/pcre.h> 39 #endif 40 41 #include "Matrix.h" 42 #include "base-list.h" 43 #include "lo-error.h" 44 #include "oct-locbuf.h" 45 #include "quit.h" 46 #include "lo-regexp.h" 47 #include "str-vec.h" 48 #include "unistr-wrappers.h" 49 50 namespace octave 51 { 52 // Define the maximum number of retries for a pattern 53 // that possibly results in an infinite recursion. 54 #define PCRE_MATCHLIMIT_MAX 10 55 56 // FIXME: should this be configurable? 57 #define MAXLOOKBEHIND 10 58 59 static bool lookbehind_warned = false; 60 61 // FIXME: don't bother collecting and composing return values 62 // the user doesn't want. 63 64 void free(void)65 regexp::free (void) 66 { 67 if (m_data) 68 pcre_free (static_cast<pcre *> (m_data)); 69 } 70 71 void compile_internal(void)72 regexp::compile_internal (void) 73 { 74 // If we had a previously compiled pattern, release it. 75 free (); 76 77 std::size_t max_length = MAXLOOKBEHIND; 78 79 std::size_t pos = 0; 80 std::size_t new_pos; 81 int inames = 0; 82 std::ostringstream buf; 83 84 while ((new_pos = m_pattern.find ("(?", pos)) != std::string::npos) 85 { 86 if (m_pattern.at (new_pos + 2) == '<' 87 && !(m_pattern.at (new_pos + 3) == '=' 88 || m_pattern.at (new_pos + 3) == '!')) 89 { 90 // The syntax of named tokens in pcre is "(?P<name>...)" while 91 // we need a syntax "(?<name>...)", so fix that here. Also an 92 // expression like 93 // "(?<first>\w+)\s+(?<last>\w+)|(?<last>\w+),\s+(?<first>\w+)" 94 // should be perfectly legal, while pcre does not allow the same 95 // named token name on both sides of the alternative. Also fix 96 // that here by replacing name tokens by dummy names, and dealing 97 // with the dummy names later. 98 99 std::size_t tmp_pos = m_pattern.find_first_of ('>', new_pos); 100 101 if (tmp_pos == std::string::npos) 102 (*current_liboctave_error_handler) 103 ("regexp: syntax error in pattern"); 104 105 std::string tmp_name 106 = m_pattern.substr (new_pos+3, tmp_pos-new_pos-3); 107 108 bool found = false; 109 110 for (int i = 0; i < m_names; i++) 111 { 112 if (m_named_pats(i) == tmp_name) 113 { 114 m_named_idx.resize (dim_vector (inames+1, 1)); 115 m_named_idx(inames) = i; 116 found = true; 117 break; 118 } 119 } 120 121 if (! found) 122 { 123 m_named_idx.resize (dim_vector (inames+1, 1)); 124 m_named_idx(inames) = m_names; 125 m_named_pats.append (tmp_name); 126 m_names++; 127 } 128 129 if (new_pos - pos > 0) 130 buf << m_pattern.substr (pos, new_pos-pos); 131 if (inames < 10) 132 buf << "(?P<n00" << inames++; 133 else if (inames < 100) 134 buf << "(?P<n0" << inames++; 135 else 136 buf << "(?P<n" << inames++; 137 138 pos = tmp_pos; 139 } 140 else if (m_pattern.at (new_pos + 2) == '<') 141 { 142 // Find lookbehind operators of arbitrary length (ie like 143 // "(?<=[a-z]*)") and replace with a maximum length operator 144 // as PCRE can not yet handle arbitrary length lookahead 145 // operators. Use the string length as the maximum length to 146 // avoid issues. 147 148 int brackets = 1; 149 std::size_t tmp_pos1 = new_pos + 2; 150 std::size_t tmp_pos2 = tmp_pos1; 151 152 while (tmp_pos1 < m_pattern.length () && brackets > 0) 153 { 154 char ch = m_pattern.at (tmp_pos1); 155 156 if (ch == '(') 157 brackets++; 158 else if (ch == ')') 159 { 160 if (brackets > 1) 161 tmp_pos2 = tmp_pos1; 162 163 brackets--; 164 } 165 166 tmp_pos1++; 167 } 168 169 if (brackets != 0) 170 { 171 buf << m_pattern.substr (pos, new_pos - pos) << "(?"; 172 pos = new_pos + 2; 173 } 174 else 175 { 176 std::size_t tmp_pos3 = m_pattern.find_first_of ("*+", tmp_pos2); 177 178 if (tmp_pos3 != std::string::npos && tmp_pos3 < tmp_pos1) 179 { 180 if (! lookbehind_warned) 181 { 182 lookbehind_warned = true; 183 (*current_liboctave_warning_with_id_handler) 184 ("Octave:regexp-lookbehind-limit", 185 "%s: arbitrary length lookbehind patterns are only supported up to length %d", 186 m_who.c_str (), MAXLOOKBEHIND); 187 } 188 189 buf << m_pattern.substr (pos, new_pos - pos) << '('; 190 191 std::size_t i; 192 193 if (m_pattern.at (tmp_pos3) == '*') 194 i = 0; 195 else 196 i = 1; 197 198 for (; i < max_length + 1; i++) 199 { 200 buf << m_pattern.substr (new_pos, tmp_pos3 - new_pos) 201 << '{' << i << '}'; 202 buf << m_pattern.substr (tmp_pos3 + 1, 203 tmp_pos1 - tmp_pos3 - 1); 204 if (i != max_length) 205 buf << '|'; 206 } 207 buf << ')'; 208 } 209 else 210 buf << m_pattern.substr (pos, tmp_pos1 - pos); 211 212 pos = tmp_pos1; 213 } 214 } 215 else 216 { 217 buf << m_pattern.substr (pos, new_pos - pos) << "(?"; 218 pos = new_pos + 2; 219 } 220 221 } 222 223 buf << m_pattern.substr (pos); 224 225 // Replace NULLs with escape sequence because conversion function c_str() 226 // will terminate string early at embedded NULLs. 227 std::string buf_str = buf.str (); 228 while ((pos = buf_str.find ('\0')) != std::string::npos) 229 buf_str.replace (pos, 1, "\\000"); 230 231 const char *err; 232 int erroffset; 233 234 int pcre_options 235 = ( (m_options.case_insensitive () ? PCRE_CASELESS : 0) 236 | (m_options.dotexceptnewline () ? 0 : PCRE_DOTALL) 237 | (m_options.lineanchors () ? PCRE_MULTILINE : 0) 238 | (m_options.freespacing () ? PCRE_EXTENDED : 0) 239 | PCRE_UTF8); 240 241 m_data = pcre_compile (buf_str.c_str (), pcre_options, 242 &err, &erroffset, nullptr); 243 244 if (! m_data) 245 (*current_liboctave_error_handler) 246 ("%s: %s at position %d of expression", m_who.c_str (), err, erroffset); 247 } 248 249 regexp::match_data match(const std::string & buffer)250 regexp::match (const std::string& buffer) 251 { 252 // check if input is valid utf-8 253 const uint8_t *buf_str = reinterpret_cast<const uint8_t *> (buffer.c_str ()); 254 if (octave_u8_check_wrapper (buf_str, buffer.length ())) 255 (*current_liboctave_error_handler) 256 ("%s: the input string is invalid UTF-8", m_who.c_str ()); 257 258 regexp::match_data retval; 259 260 std::list<regexp::match_element> lst; 261 262 int subpatterns; 263 int namecount; 264 int nameentrysize; 265 char *nametable; 266 std::size_t idx = 0; 267 268 pcre *re = static_cast<pcre *> (m_data); 269 270 pcre_fullinfo (re, nullptr, PCRE_INFO_CAPTURECOUNT, &subpatterns); 271 pcre_fullinfo (re, nullptr, PCRE_INFO_NAMECOUNT, &namecount); 272 pcre_fullinfo (re, nullptr, PCRE_INFO_NAMEENTRYSIZE, &nameentrysize); 273 pcre_fullinfo (re, nullptr, PCRE_INFO_NAMETABLE, &nametable); 274 275 OCTAVE_LOCAL_BUFFER (int, ovector, (subpatterns+1)*3); 276 OCTAVE_LOCAL_BUFFER (int, nidx, namecount); 277 278 for (int i = 0; i < namecount; i++) 279 { 280 // Index of subpattern in first two bytes of name (MSB first). 281 // Extract index. 282 nidx[i] = (static_cast<int> (nametable[i*nameentrysize])) << 8 283 | static_cast<int> (nametable[i*nameentrysize+1]); 284 } 285 286 while (true) 287 { 288 octave_quit (); 289 290 int matches = pcre_exec (re, nullptr, buffer.c_str (), 291 buffer.length (), idx, 292 PCRE_NO_UTF8_CHECK | (idx ? PCRE_NOTBOL : 0), 293 ovector, (subpatterns+1)*3); 294 295 if (matches == PCRE_ERROR_MATCHLIMIT) 296 { 297 // Try harder; start with default value for MATCH_LIMIT 298 // and increase it. 299 (*current_liboctave_warning_with_id_handler) 300 ("Octave:regexp-match-limit", 301 "your pattern caused PCRE to hit its MATCH_LIMIT; trying harder now, but this will be slow"); 302 303 pcre_extra pe; 304 305 pcre_config (PCRE_CONFIG_MATCH_LIMIT, 306 static_cast<void *> (&pe.match_limit)); 307 308 pe.flags = PCRE_EXTRA_MATCH_LIMIT; 309 310 int i = 0; 311 while (matches == PCRE_ERROR_MATCHLIMIT 312 && i++ < PCRE_MATCHLIMIT_MAX) 313 { 314 octave_quit (); 315 316 pe.match_limit *= 10; 317 matches = pcre_exec (re, &pe, buffer.c_str (), 318 buffer.length (), idx, 319 PCRE_NO_UTF8_CHECK 320 | (idx ? PCRE_NOTBOL : 0), 321 ovector, (subpatterns+1)*3); 322 } 323 } 324 325 if (matches < 0 && matches != PCRE_ERROR_NOMATCH) 326 (*current_liboctave_error_handler) 327 ("%s: internal error calling pcre_exec; " 328 "error code from pcre_exec is %i", m_who.c_str (), matches); 329 330 if (matches == PCRE_ERROR_NOMATCH) 331 break; 332 else if (ovector[0] >= ovector[1] && ! m_options.emptymatch ()) 333 { 334 // Zero length match. Skip to next char. 335 idx = ovector[0] + 1; 336 if (idx < buffer.length ()) 337 continue; 338 else 339 break; 340 } 341 else 342 { 343 int pos_match = 0; 344 Matrix token_extents (matches-1, 2); 345 346 for (int i = 1; i < matches; i++) 347 { 348 if (ovector[2*i] >= 0 && ovector[2*i+1] > 0 349 && (i == 1 || ovector[2*i] != ovector[2*i-2] 350 || ovector[2*i-1] != ovector[2*i+1])) 351 { 352 token_extents(pos_match,0) = double (ovector[2*i]+1); 353 token_extents(pos_match++,1) = double (ovector[2*i+1]); 354 } 355 } 356 357 token_extents.resize (pos_match, 2); 358 359 double start = double (ovector[0]+1); 360 double end = double (ovector[1]); 361 362 const char **listptr; 363 int status = pcre_get_substring_list (buffer.c_str (), ovector, 364 matches, &listptr); 365 366 if (status == PCRE_ERROR_NOMEMORY) 367 (*current_liboctave_error_handler) 368 ("%s: cannot allocate memory in pcre_get_substring_list", 369 m_who.c_str ()); 370 371 // Must use explicit length constructor as match can contain '\0'. 372 std::string match_string = std::string (*listptr, end - start + 1); 373 374 string_vector tokens (pos_match); 375 string_vector named_tokens (m_names); 376 int pos_offset = 0; 377 pos_match = 0; 378 379 for (int i = 1; i < matches; i++) 380 { 381 if (ovector[2*i] >= 0 && ovector[2*i+1] > 0) 382 { 383 if (i == 1 || ovector[2*i] != ovector[2*i-2] 384 || ovector[2*i-1] != ovector[2*i+1]) 385 { 386 if (namecount > 0) 387 { 388 // FIXME: Should probably do this with a map() 389 // rather than a linear search. However, 390 // the number of captured, named expressions 391 // is usually pretty small (< 4) 392 for (int j = 0; j < namecount; j++) 393 { 394 if (nidx[j] == i) 395 { 396 std::size_t len = ovector[2*i+1] - ovector[2*i]; 397 named_tokens(m_named_idx(j)) 398 = std::string (*(listptr+i-pos_offset), 399 len); 400 break; 401 } 402 } 403 } 404 405 std::size_t len = ovector[2*i+1] - ovector[2*i]; 406 tokens(pos_match++) = std::string (*(listptr+i), len); 407 } 408 else 409 pos_offset++; 410 } 411 } 412 413 pcre_free_substring_list (listptr); 414 415 regexp::match_element new_elem (named_tokens, tokens, match_string, 416 token_extents, start, end); 417 lst.push_back (new_elem); 418 419 if (ovector[1] <= ovector[0]) 420 { 421 // Zero length match. Skip to next char. 422 idx = ovector[0] + 1; 423 if (idx <= buffer.length ()) 424 continue; 425 } 426 else 427 idx = ovector[1]; 428 429 if (m_options.once () || idx >= buffer.length ()) 430 break; 431 } 432 } 433 434 retval = regexp::match_data (lst, m_named_pats); 435 436 return retval; 437 } 438 439 bool is_match(const std::string & buffer)440 regexp::is_match (const std::string& buffer) 441 { 442 regexp::match_data rx_lst = match (buffer); 443 444 return rx_lst.size () > 0; 445 } 446 447 Array<bool> is_match(const string_vector & buffer)448 regexp::is_match (const string_vector& buffer) 449 { 450 octave_idx_type len = buffer.numel (); 451 452 Array<bool> retval (dim_vector (len, 1)); 453 454 for (octave_idx_type i = 0; i < buffer.numel (); i++) 455 retval(i) = is_match (buffer(i)); 456 457 return retval; 458 } 459 460 // Declare rep_token_t used in processing replacement string 461 typedef struct 462 { 463 std::size_t pos; 464 int num; 465 } rep_token_t; 466 467 std::string replace(const std::string & buffer,const std::string & replacement)468 regexp::replace (const std::string& buffer, const std::string& replacement) 469 { 470 std::string retval; 471 472 const regexp::match_data rx_lst = match (buffer); 473 474 std::size_t num_matches = rx_lst.size (); 475 476 if (num_matches == 0) 477 { 478 retval = buffer; 479 return retval; 480 } 481 482 // Identify replacement tokens; build a vector of group numbers in 483 // the replacement string so that we can quickly calculate the size 484 // of the replacement. 485 486 // FIXME: All code assumes that only 10 tokens ($0-$9) exist. 487 // $11 represents $1 followed by the character '1' rather than 488 // the eleventh capture buffer. 489 490 std::string repstr = replacement; 491 std::vector<rep_token_t> tokens; 492 tokens.reserve (5); // Reserve memory for 5 pattern replacements 493 494 for (std::size_t i=0; i < repstr.size (); i++) 495 { 496 if (repstr[i] == '\\') 497 { 498 if (i < repstr.size () - 1 && repstr[i+1] == '$') 499 { 500 repstr.erase (i,1); // erase backslash 501 i++; // skip over '$' 502 continue; 503 } 504 if (i < repstr.size () - 1 && repstr[i+1] == '\\') 505 { 506 repstr.erase (i,1); // erase 1st backslash 507 continue; 508 } 509 } 510 else if (repstr[i] == '$') 511 { 512 if (i < repstr.size () - 1 && isdigit (repstr[i+1])) 513 { 514 rep_token_t tmp_token; 515 516 tmp_token.pos = i; 517 tmp_token.num = repstr[i+1]-'0'; 518 tokens.push_back (tmp_token); 519 } 520 } 521 } 522 523 std::string rep; 524 int num_tokens = tokens.size (); 525 526 if (num_tokens > 0) 527 { 528 // Determine replacement length 529 const std::size_t replen = repstr.size () - 2*num_tokens; 530 int delta = 0; 531 auto p = rx_lst.begin (); 532 for (std::size_t i = 0; i < num_matches; i++) 533 { 534 octave_quit (); 535 536 double start = p->start (); 537 double end = p->end (); 538 539 const Matrix pairs (p->token_extents ()); 540 std::size_t pairlen = 0; 541 for (int j = 0; j < num_tokens; j++) 542 { 543 if (tokens[j].num == 0) 544 pairlen += static_cast<std::size_t> (end - start + 1); 545 else if (tokens[j].num <= pairs.rows ()) 546 pairlen += static_cast<std::size_t> (pairs(tokens[j].num-1,1) 547 - pairs(tokens[j].num-1,0) 548 + 1); 549 } 550 delta += (static_cast<int> (replen + pairlen) 551 - static_cast<int> (end - start + 1)); 552 p++; 553 } 554 555 // Build replacement string 556 rep.reserve (buffer.size () + delta); 557 std::size_t from = 0; 558 p = rx_lst.begin (); 559 for (std::size_t i = 0; i < num_matches; i++) 560 { 561 octave_quit (); 562 563 double start = p->start (); 564 double end = p->end (); 565 566 const Matrix pairs (p->token_extents ()); 567 rep.append (&buffer[from], static_cast<std::size_t> (start - 1 - from)); 568 from = static_cast<std::size_t> (end); 569 570 std::size_t cur_pos = 0; 571 572 for (int j = 0; j < num_tokens; j++) 573 { 574 rep.append (&repstr[cur_pos], (tokens[j].pos) - cur_pos); 575 cur_pos = tokens[j].pos+2; 576 577 int k = tokens[j].num; 578 if (k == 0) 579 { 580 // replace with entire match 581 rep.append (&buffer[static_cast<std::size_t> (end - 1)], 582 static_cast<std::size_t> (end - start + 1)); 583 } 584 else if (k <= pairs.rows ()) 585 { 586 // replace with group capture 587 rep.append (&buffer[static_cast<std::size_t> (pairs(k-1,0)-1)], 588 static_cast<std::size_t> (pairs(k-1,1) 589 - pairs(k-1,0) + 1)); 590 } 591 else 592 { 593 // replace with nothing 594 } 595 } 596 if (cur_pos < repstr.size ()) 597 rep.append (&repstr[cur_pos], repstr.size () - cur_pos); 598 599 p++; 600 } 601 rep.append (&buffer[from], buffer.size () - from); 602 } 603 else 604 { 605 // Determine repstr length 606 const std::size_t replen = repstr.size (); 607 int delta = 0; 608 auto p = rx_lst.begin (); 609 for (std::size_t i = 0; i < num_matches; i++) 610 { 611 octave_quit (); 612 613 delta += static_cast<int> (replen) 614 - static_cast<int> (p->end () - p->start () + 1); 615 p++; 616 } 617 618 // Build replacement string 619 rep.reserve (buffer.size () + delta); 620 std::size_t from = 0; 621 p = rx_lst.begin (); 622 for (std::size_t i = 0; i < num_matches; i++) 623 { 624 octave_quit (); 625 626 rep.append (&buffer[from], 627 static_cast<std::size_t> (p->start () - 1 - from)); 628 from = static_cast<std::size_t> (p->end ()); 629 rep.append (repstr); 630 p++; 631 } 632 rep.append (&buffer[from], buffer.size () - from); 633 } 634 635 retval = rep; 636 return retval; 637 } 638 } 639