1 ////////////////////////////////////////////////////////////////////////
2 //
3 // Copyright (C) 2002-2021 The Octave Project Developers
4 //
5 // See the file COPYRIGHT.md in the top-level directory of this
6 // distribution or <https://octave.org/copyright/>.
7 //
8 // This file is part of Octave.
9 //
10 // Octave is free software: you can redistribute it and/or modify it
11 // under the terms of the GNU General Public License as published by
12 // the Free Software Foundation, either version 3 of the License, or
13 // (at your option) any later version.
14 //
15 // Octave is distributed in the hope that it will be useful, but
16 // WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18 // GNU General Public License for more details.
19 //
20 // You should have received a copy of the GNU General Public License
21 // along with Octave; see the file COPYING.  If not, see
22 // <https://www.gnu.org/licenses/>.
23 //
24 ////////////////////////////////////////////////////////////////////////
25 
26 #if defined (HAVE_CONFIG_H)
27 #  include "config.h"
28 #endif
29 
30 #include <list>
31 #include <sstream>
32 #include <string>
33 #include <vector>
34 
35 #if defined (HAVE_PCRE_H)
36 #  include <pcre.h>
37 #elif defined (HAVE_PCRE_PCRE_H)
38 #  include <pcre/pcre.h>
39 #endif
40 
41 #include "Matrix.h"
42 #include "base-list.h"
43 #include "lo-error.h"
44 #include "oct-locbuf.h"
45 #include "quit.h"
46 #include "lo-regexp.h"
47 #include "str-vec.h"
48 #include "unistr-wrappers.h"
49 
50 namespace octave
51 {
52   // Define the maximum number of retries for a pattern
53   // that possibly results in an infinite recursion.
54 #define PCRE_MATCHLIMIT_MAX 10
55 
56   // FIXME: should this be configurable?
57 #define MAXLOOKBEHIND 10
58 
59   static bool lookbehind_warned = false;
60 
61   // FIXME: don't bother collecting and composing return values
62   //        the user doesn't want.
63 
64   void
free(void)65   regexp::free (void)
66   {
67     if (m_data)
68       pcre_free (static_cast<pcre *> (m_data));
69   }
70 
71   void
compile_internal(void)72   regexp::compile_internal (void)
73   {
74     // If we had a previously compiled pattern, release it.
75     free ();
76 
77     std::size_t max_length = MAXLOOKBEHIND;
78 
79     std::size_t pos = 0;
80     std::size_t new_pos;
81     int inames = 0;
82     std::ostringstream buf;
83 
84     while ((new_pos = m_pattern.find ("(?", pos)) != std::string::npos)
85       {
86         if (m_pattern.at (new_pos + 2) == '<'
87             && !(m_pattern.at (new_pos + 3) == '='
88                  || m_pattern.at (new_pos + 3) == '!'))
89           {
90             // The syntax of named tokens in pcre is "(?P<name>...)" while
91             // we need a syntax "(?<name>...)", so fix that here.  Also an
92             // expression like
93             // "(?<first>\w+)\s+(?<last>\w+)|(?<last>\w+),\s+(?<first>\w+)"
94             // should be perfectly legal, while pcre does not allow the same
95             // named token name on both sides of the alternative.  Also fix
96             // that here by replacing name tokens by dummy names, and dealing
97             // with the dummy names later.
98 
99             std::size_t tmp_pos = m_pattern.find_first_of ('>', new_pos);
100 
101             if (tmp_pos == std::string::npos)
102               (*current_liboctave_error_handler)
103                 ("regexp: syntax error in pattern");
104 
105             std::string tmp_name
106               = m_pattern.substr (new_pos+3, tmp_pos-new_pos-3);
107 
108             bool found = false;
109 
110             for (int i = 0; i < m_names; i++)
111               {
112                 if (m_named_pats(i) == tmp_name)
113                   {
114                     m_named_idx.resize (dim_vector (inames+1, 1));
115                     m_named_idx(inames) = i;
116                     found = true;
117                     break;
118                   }
119               }
120 
121             if (! found)
122               {
123                 m_named_idx.resize (dim_vector (inames+1, 1));
124                 m_named_idx(inames) = m_names;
125                 m_named_pats.append (tmp_name);
126                 m_names++;
127               }
128 
129             if (new_pos - pos > 0)
130               buf << m_pattern.substr (pos, new_pos-pos);
131             if (inames < 10)
132               buf << "(?P<n00" << inames++;
133             else if (inames < 100)
134               buf << "(?P<n0" << inames++;
135             else
136               buf << "(?P<n" << inames++;
137 
138             pos = tmp_pos;
139           }
140         else if (m_pattern.at (new_pos + 2) == '<')
141           {
142             // Find lookbehind operators of arbitrary length (ie like
143             // "(?<=[a-z]*)") and replace with a maximum length operator
144             // as PCRE can not yet handle arbitrary length lookahead
145             // operators.  Use the string length as the maximum length to
146             // avoid issues.
147 
148             int brackets = 1;
149             std::size_t tmp_pos1 = new_pos + 2;
150             std::size_t tmp_pos2 = tmp_pos1;
151 
152             while (tmp_pos1 < m_pattern.length () && brackets > 0)
153               {
154                 char ch = m_pattern.at (tmp_pos1);
155 
156                 if (ch == '(')
157                   brackets++;
158                 else if (ch == ')')
159                   {
160                     if (brackets > 1)
161                       tmp_pos2 = tmp_pos1;
162 
163                     brackets--;
164                   }
165 
166                 tmp_pos1++;
167               }
168 
169             if (brackets != 0)
170               {
171                 buf << m_pattern.substr (pos, new_pos - pos) << "(?";
172                 pos = new_pos + 2;
173               }
174             else
175               {
176                 std::size_t tmp_pos3 = m_pattern.find_first_of ("*+", tmp_pos2);
177 
178                 if (tmp_pos3 != std::string::npos && tmp_pos3 < tmp_pos1)
179                   {
180                     if (! lookbehind_warned)
181                       {
182                         lookbehind_warned = true;
183                         (*current_liboctave_warning_with_id_handler)
184                           ("Octave:regexp-lookbehind-limit",
185                            "%s: arbitrary length lookbehind patterns are only supported up to length %d",
186                            m_who.c_str (), MAXLOOKBEHIND);
187                       }
188 
189                     buf << m_pattern.substr (pos, new_pos - pos) << '(';
190 
191                     std::size_t i;
192 
193                     if (m_pattern.at (tmp_pos3) == '*')
194                       i = 0;
195                     else
196                       i = 1;
197 
198                     for (; i < max_length + 1; i++)
199                       {
200                         buf << m_pattern.substr (new_pos, tmp_pos3 - new_pos)
201                             << '{' << i << '}';
202                         buf << m_pattern.substr (tmp_pos3 + 1,
203                                                  tmp_pos1 - tmp_pos3 - 1);
204                         if (i != max_length)
205                           buf << '|';
206                       }
207                     buf << ')';
208                   }
209                 else
210                   buf << m_pattern.substr (pos, tmp_pos1 - pos);
211 
212                 pos = tmp_pos1;
213               }
214           }
215         else
216           {
217             buf << m_pattern.substr (pos, new_pos - pos) << "(?";
218             pos = new_pos + 2;
219           }
220 
221       }
222 
223     buf << m_pattern.substr (pos);
224 
225     // Replace NULLs with escape sequence because conversion function c_str()
226     // will terminate string early at embedded NULLs.
227     std::string buf_str = buf.str ();
228     while ((pos = buf_str.find ('\0')) != std::string::npos)
229       buf_str.replace (pos, 1, "\\000");
230 
231     const char *err;
232     int erroffset;
233 
234     int pcre_options
235       = (  (m_options.case_insensitive () ? PCRE_CASELESS : 0)
236          | (m_options.dotexceptnewline () ? 0 : PCRE_DOTALL)
237          | (m_options.lineanchors () ? PCRE_MULTILINE : 0)
238          | (m_options.freespacing () ? PCRE_EXTENDED : 0)
239          | PCRE_UTF8);
240 
241     m_data = pcre_compile (buf_str.c_str (), pcre_options,
242                            &err, &erroffset, nullptr);
243 
244     if (! m_data)
245       (*current_liboctave_error_handler)
246         ("%s: %s at position %d of expression", m_who.c_str (), err, erroffset);
247   }
248 
249   regexp::match_data
match(const std::string & buffer)250   regexp::match (const std::string& buffer)
251   {
252     // check if input is valid utf-8
253     const uint8_t *buf_str = reinterpret_cast<const uint8_t *> (buffer.c_str ());
254     if (octave_u8_check_wrapper (buf_str, buffer.length ()))
255       (*current_liboctave_error_handler)
256         ("%s: the input string is invalid UTF-8", m_who.c_str ());
257 
258     regexp::match_data retval;
259 
260     std::list<regexp::match_element> lst;
261 
262     int subpatterns;
263     int namecount;
264     int nameentrysize;
265     char *nametable;
266     std::size_t idx = 0;
267 
268     pcre *re = static_cast<pcre *> (m_data);
269 
270     pcre_fullinfo (re, nullptr, PCRE_INFO_CAPTURECOUNT,  &subpatterns);
271     pcre_fullinfo (re, nullptr, PCRE_INFO_NAMECOUNT, &namecount);
272     pcre_fullinfo (re, nullptr, PCRE_INFO_NAMEENTRYSIZE, &nameentrysize);
273     pcre_fullinfo (re, nullptr, PCRE_INFO_NAMETABLE, &nametable);
274 
275     OCTAVE_LOCAL_BUFFER (int, ovector, (subpatterns+1)*3);
276     OCTAVE_LOCAL_BUFFER (int, nidx, namecount);
277 
278     for (int i = 0; i < namecount; i++)
279       {
280         // Index of subpattern in first two bytes of name (MSB first).
281         // Extract index.
282         nidx[i] = (static_cast<int> (nametable[i*nameentrysize])) << 8
283                   | static_cast<int> (nametable[i*nameentrysize+1]);
284       }
285 
286     while (true)
287       {
288         octave_quit ();
289 
290         int matches = pcre_exec (re, nullptr, buffer.c_str (),
291                                  buffer.length (), idx,
292                                  PCRE_NO_UTF8_CHECK | (idx ? PCRE_NOTBOL : 0),
293                                  ovector, (subpatterns+1)*3);
294 
295         if (matches == PCRE_ERROR_MATCHLIMIT)
296           {
297             // Try harder; start with default value for MATCH_LIMIT
298             // and increase it.
299             (*current_liboctave_warning_with_id_handler)
300               ("Octave:regexp-match-limit",
301                "your pattern caused PCRE to hit its MATCH_LIMIT; trying harder now, but this will be slow");
302 
303             pcre_extra pe;
304 
305             pcre_config (PCRE_CONFIG_MATCH_LIMIT,
306                          static_cast<void *> (&pe.match_limit));
307 
308             pe.flags = PCRE_EXTRA_MATCH_LIMIT;
309 
310             int i = 0;
311             while (matches == PCRE_ERROR_MATCHLIMIT
312                    && i++ < PCRE_MATCHLIMIT_MAX)
313               {
314                 octave_quit ();
315 
316                 pe.match_limit *= 10;
317                 matches = pcre_exec (re, &pe, buffer.c_str (),
318                                      buffer.length (), idx,
319                                      PCRE_NO_UTF8_CHECK
320                                      | (idx ? PCRE_NOTBOL : 0),
321                                      ovector, (subpatterns+1)*3);
322               }
323           }
324 
325         if (matches < 0 && matches != PCRE_ERROR_NOMATCH)
326           (*current_liboctave_error_handler)
327             ("%s: internal error calling pcre_exec; "
328              "error code from pcre_exec is %i", m_who.c_str (), matches);
329 
330         if (matches == PCRE_ERROR_NOMATCH)
331           break;
332         else if (ovector[0] >= ovector[1] && ! m_options.emptymatch ())
333           {
334             // Zero length match.  Skip to next char.
335             idx = ovector[0] + 1;
336             if (idx < buffer.length ())
337               continue;
338             else
339               break;
340           }
341         else
342           {
343             int pos_match = 0;
344             Matrix token_extents (matches-1, 2);
345 
346             for (int i = 1; i < matches; i++)
347               {
348                 if (ovector[2*i] >= 0 && ovector[2*i+1] > 0
349                     && (i == 1 || ovector[2*i] != ovector[2*i-2]
350                         || ovector[2*i-1] != ovector[2*i+1]))
351                   {
352                     token_extents(pos_match,0) = double (ovector[2*i]+1);
353                     token_extents(pos_match++,1) = double (ovector[2*i+1]);
354                   }
355               }
356 
357             token_extents.resize (pos_match, 2);
358 
359             double start = double (ovector[0]+1);
360             double end = double (ovector[1]);
361 
362             const char **listptr;
363             int status = pcre_get_substring_list (buffer.c_str (), ovector,
364                                                   matches, &listptr);
365 
366             if (status == PCRE_ERROR_NOMEMORY)
367               (*current_liboctave_error_handler)
368                 ("%s: cannot allocate memory in pcre_get_substring_list",
369                  m_who.c_str ());
370 
371             // Must use explicit length constructor as match can contain '\0'.
372             std::string match_string = std::string (*listptr, end - start + 1);
373 
374             string_vector tokens (pos_match);
375             string_vector named_tokens (m_names);
376             int pos_offset = 0;
377             pos_match = 0;
378 
379             for (int i = 1; i < matches; i++)
380               {
381                 if (ovector[2*i] >= 0 && ovector[2*i+1] > 0)
382                   {
383                     if (i == 1 || ovector[2*i] != ovector[2*i-2]
384                         || ovector[2*i-1] != ovector[2*i+1])
385                       {
386                         if (namecount > 0)
387                           {
388                             // FIXME: Should probably do this with a map()
389                             //        rather than a linear search.  However,
390                             //        the number of captured, named expressions
391                             //        is usually pretty small (< 4)
392                             for (int j = 0; j < namecount; j++)
393                               {
394                                 if (nidx[j] == i)
395                                   {
396                                     std::size_t len = ovector[2*i+1] - ovector[2*i];
397                                     named_tokens(m_named_idx(j))
398                                       = std::string (*(listptr+i-pos_offset),
399                                                      len);
400                                     break;
401                                   }
402                               }
403                           }
404 
405                         std::size_t len = ovector[2*i+1] - ovector[2*i];
406                         tokens(pos_match++) = std::string (*(listptr+i), len);
407                       }
408                     else
409                       pos_offset++;
410                   }
411               }
412 
413             pcre_free_substring_list (listptr);
414 
415             regexp::match_element new_elem (named_tokens, tokens, match_string,
416                                             token_extents, start, end);
417             lst.push_back (new_elem);
418 
419             if (ovector[1] <= ovector[0])
420               {
421                 // Zero length match.  Skip to next char.
422                 idx = ovector[0] + 1;
423                 if (idx <= buffer.length ())
424                   continue;
425               }
426             else
427               idx = ovector[1];
428 
429             if (m_options.once () || idx >= buffer.length ())
430               break;
431           }
432       }
433 
434     retval = regexp::match_data (lst, m_named_pats);
435 
436     return retval;
437   }
438 
439   bool
is_match(const std::string & buffer)440   regexp::is_match (const std::string& buffer)
441   {
442     regexp::match_data rx_lst = match (buffer);
443 
444     return rx_lst.size () > 0;
445   }
446 
447   Array<bool>
is_match(const string_vector & buffer)448   regexp::is_match (const string_vector& buffer)
449   {
450     octave_idx_type len = buffer.numel ();
451 
452     Array<bool> retval (dim_vector (len, 1));
453 
454     for (octave_idx_type i = 0; i < buffer.numel (); i++)
455       retval(i) = is_match (buffer(i));
456 
457     return retval;
458   }
459 
460   // Declare rep_token_t used in processing replacement string
461   typedef struct
462   {
463     std::size_t pos;
464     int num;
465   } rep_token_t;
466 
467   std::string
replace(const std::string & buffer,const std::string & replacement)468   regexp::replace (const std::string& buffer, const std::string& replacement)
469   {
470     std::string retval;
471 
472     const regexp::match_data rx_lst = match (buffer);
473 
474     std::size_t num_matches = rx_lst.size ();
475 
476     if (num_matches == 0)
477       {
478         retval = buffer;
479         return retval;
480       }
481 
482     // Identify replacement tokens; build a vector of group numbers in
483     // the replacement string so that we can quickly calculate the size
484     // of the replacement.
485 
486     // FIXME: All code assumes that only 10 tokens ($0-$9) exist.
487     //        $11 represents $1 followed by the character '1' rather than
488     //        the eleventh capture buffer.
489 
490     std::string repstr = replacement;
491     std::vector<rep_token_t> tokens;
492     tokens.reserve (5);  // Reserve memory for 5 pattern replacements
493 
494     for (std::size_t i=0; i < repstr.size (); i++)
495       {
496         if (repstr[i] == '\\')
497           {
498             if (i < repstr.size () - 1 && repstr[i+1] == '$')
499               {
500                 repstr.erase (i,1);  // erase backslash
501                 i++;                 // skip over '$'
502                 continue;
503               }
504             if (i < repstr.size () - 1 && repstr[i+1] == '\\')
505               {
506                 repstr.erase (i,1);  // erase 1st backslash
507                 continue;
508               }
509           }
510         else if (repstr[i] == '$')
511           {
512             if (i < repstr.size () - 1 && isdigit (repstr[i+1]))
513               {
514                 rep_token_t tmp_token;
515 
516                 tmp_token.pos = i;
517                 tmp_token.num = repstr[i+1]-'0';
518                 tokens.push_back (tmp_token);
519               }
520           }
521       }
522 
523     std::string rep;
524     int num_tokens = tokens.size ();
525 
526     if (num_tokens > 0)
527       {
528         // Determine replacement length
529         const std::size_t replen = repstr.size () - 2*num_tokens;
530         int delta = 0;
531         auto p = rx_lst.begin ();
532         for (std::size_t i = 0; i < num_matches; i++)
533           {
534             octave_quit ();
535 
536             double start = p->start ();
537             double end = p->end ();
538 
539             const Matrix pairs (p->token_extents ());
540             std::size_t pairlen = 0;
541             for (int j = 0; j < num_tokens; j++)
542               {
543                 if (tokens[j].num == 0)
544                   pairlen += static_cast<std::size_t> (end - start + 1);
545                 else if (tokens[j].num <= pairs.rows ())
546                   pairlen += static_cast<std::size_t> (pairs(tokens[j].num-1,1)
547                                                   - pairs(tokens[j].num-1,0)
548                                                   + 1);
549               }
550             delta += (static_cast<int> (replen + pairlen)
551                       - static_cast<int> (end - start + 1));
552             p++;
553           }
554 
555         // Build replacement string
556         rep.reserve (buffer.size () + delta);
557         std::size_t from = 0;
558         p = rx_lst.begin ();
559         for (std::size_t i = 0; i < num_matches; i++)
560           {
561             octave_quit ();
562 
563             double start = p->start ();
564             double end = p->end ();
565 
566             const Matrix pairs (p->token_extents ());
567             rep.append (&buffer[from], static_cast<std::size_t> (start - 1 - from));
568             from = static_cast<std::size_t> (end);
569 
570             std::size_t cur_pos = 0;
571 
572             for (int j = 0; j < num_tokens; j++)
573               {
574                 rep.append (&repstr[cur_pos], (tokens[j].pos) - cur_pos);
575                 cur_pos = tokens[j].pos+2;
576 
577                 int k = tokens[j].num;
578                 if (k == 0)
579                   {
580                     // replace with entire match
581                     rep.append (&buffer[static_cast<std::size_t> (end - 1)],
582                                 static_cast<std::size_t> (end - start + 1));
583                   }
584                 else if (k <= pairs.rows ())
585                   {
586                     // replace with group capture
587                     rep.append (&buffer[static_cast<std::size_t> (pairs(k-1,0)-1)],
588                                 static_cast<std::size_t> (pairs(k-1,1)
589                                                      - pairs(k-1,0) + 1));
590                   }
591                 else
592                   {
593                     // replace with nothing
594                   }
595               }
596             if (cur_pos < repstr.size ())
597               rep.append (&repstr[cur_pos], repstr.size () - cur_pos);
598 
599             p++;
600           }
601         rep.append (&buffer[from], buffer.size () - from);
602       }
603     else
604       {
605         // Determine repstr length
606         const std::size_t replen = repstr.size ();
607         int delta = 0;
608         auto p = rx_lst.begin ();
609         for (std::size_t i = 0; i < num_matches; i++)
610           {
611             octave_quit ();
612 
613             delta += static_cast<int> (replen)
614                      - static_cast<int> (p->end () - p->start () + 1);
615             p++;
616           }
617 
618         // Build replacement string
619         rep.reserve (buffer.size () + delta);
620         std::size_t from = 0;
621         p = rx_lst.begin ();
622         for (std::size_t i = 0; i < num_matches; i++)
623           {
624             octave_quit ();
625 
626             rep.append (&buffer[from],
627                         static_cast<std::size_t> (p->start () - 1 - from));
628             from = static_cast<std::size_t> (p->end ());
629             rep.append (repstr);
630             p++;
631           }
632         rep.append (&buffer[from], buffer.size () - from);
633       }
634 
635     retval = rep;
636     return retval;
637   }
638 }
639