1#! /usr/bin/env perl
2
3# Program for testing regular expressions with perl to check that PCRE handles
4# them the same. This version needs to have "use utf8" at the start for running
5# the UTF-8 tests, but *not* for the other tests. The only way I've found for
6# doing this is to cat this line in explicitly in the RunPerlTest script. I've
7# also used this method to supply "require Encode" for the UTF-8 tests, so that
8# the main test will still run where Encode is not installed.
9
10#use utf8;
11#require Encode;
12
13# Function for turning a string into a string of printing chars.
14
15sub pchars {
16my($t) = "";
17
18if ($utf8)
19  {
20  @p = unpack('U*', $_[0]);
21  foreach $c (@p)
22    {
23    if ($c >= 32 && $c < 127) { $t .= chr $c; }
24      else { $t .= sprintf("\\x{%02x}", $c);
25      }
26    }
27  }
28else
29  {
30  foreach $c (split(//, $_[0]))
31    {
32    if (ord $c >= 32 && ord $c < 127) { $t .= $c; }
33      else { $t .= sprintf("\\x%02x", ord $c); }
34    }
35  }
36
37$t;
38}
39
40
41# Read lines from named file or stdin and write to named file or stdout; lines
42# consist of a regular expression, in delimiters and optionally followed by
43# options, followed by a set of test data, terminated by an empty line.
44
45# Sort out the input and output files
46
47if (@ARGV > 0)
48  {
49  open(INFILE, "<$ARGV[0]") || die "Failed to open $ARGV[0]\n";
50  $infile = "INFILE";
51  }
52else { $infile = "STDIN"; }
53
54if (@ARGV > 1)
55  {
56  open(OUTFILE, ">$ARGV[1]") || die "Failed to open $ARGV[1]\n";
57  $outfile = "OUTFILE";
58  }
59else { $outfile = "STDOUT"; }
60
61printf($outfile "Perl $] Regular Expressions\n\n");
62
63# Main loop
64
65NEXT_RE:
66for (;;)
67  {
68  printf "  re> " if $infile eq "STDIN";
69  last if ! ($_ = <$infile>);
70  printf $outfile "$_" if $infile ne "STDIN";
71  next if ($_ =~ /^\s*$/ || $_ =~ /^< forbid/);
72
73  $pattern = $_;
74
75  while ($pattern !~ /^\s*(.).*\1/s)
76    {
77    printf "    > " if $infile eq "STDIN";
78    last if ! ($_ = <$infile>);
79    printf $outfile "$_" if $infile ne "STDIN";
80    $pattern .= $_;
81    }
82
83  chomp($pattern);
84  $pattern =~ s/\s+$//;
85
86  # The private /+ modifier means "print $' afterwards".
87
88  $showrest = ($pattern =~ s/\+(?=[a-zA-Z]*$)//);
89
90  # A doubled version is used by pcretest to print remainders after captures
91
92  $pattern =~ s/\+(?=[a-zA-Z]*$)//;
93
94  # Remove /8 from a UTF-8 pattern.
95
96  $utf8 = $pattern =~ s/8(?=[a-zA-Z]*$)//;
97
98  # Remove /J from a pattern with duplicate names.
99
100  $pattern =~ s/J(?=[a-zA-Z]*$)//;
101
102  # Remove /K from a pattern (asks pcretest to check MARK data) */
103
104  $pattern =~ s/K(?=[a-zA-Z]*$)//;
105
106  # /W asks pcretest to set PCRE_UCP; change this to /u for Perl
107
108  $pattern =~ s/W(?=[a-zA-Z]*$)/u/;
109
110  # Remove /S or /SS from a pattern (asks pcretest to study or not to study)
111
112  $pattern =~ s/S(?=[a-zA-Z]*$)//g;
113
114  # Remove /Y and /O from a pattern (disable PCRE optimizations)
115
116  $pattern =~ s/[YO](?=[a-zA-Z]*$)//;
117
118  # Check that the pattern is valid
119
120  eval "\$_ =~ ${pattern}";
121  if ($@)
122    {
123    printf $outfile "Error: $@";
124    if ($infile != "STDIN")
125      {
126      for (;;)
127        {
128        last if ! ($_ = <$infile>);
129        last if $_ =~ /^\s*$/;
130        }
131      }
132    next NEXT_RE;
133    }
134
135  # If the /g modifier is present, we want to put a loop round the matching;
136  # otherwise just a single "if".
137
138  $cmd = ($pattern =~ /g[a-z]*$/)? "while" : "if";
139
140  # If the pattern is actually the null string, Perl uses the most recently
141  # executed (and successfully compiled) regex is used instead. This is a
142  # nasty trap for the unwary! The PCRE test suite does contain null strings
143  # in places - if they are allowed through here all sorts of weird and
144  # unexpected effects happen. To avoid this, we replace such patterns with
145  # a non-null pattern that has the same effect.
146
147  $pattern = "/(?#)/$2" if ($pattern =~ /^(.)\1(.*)$/);
148
149  # Read data lines and test them
150
151  for (;;)
152    {
153    printf "data> " if $infile eq "STDIN";
154    last NEXT_RE if ! ($_ = <$infile>);
155    chomp;
156    printf $outfile "$_\n" if $infile ne "STDIN";
157
158    s/\s+$//;  # Remove trailing space
159    s/^\s+//;  # Remove leading space
160    s/\\Y//g;  # Remove \Y (pcretest flag to set PCRE_NO_START_OPTIMIZE)
161
162    last if ($_ eq "");
163    $x = eval "\"$_\"";   # To get escapes processed
164
165    # Empty array for holding results, ensure $REGERROR and $REGMARK are
166    # unset, then do the matching.
167
168    @subs = ();
169
170    $pushes = "push \@subs,\$&;" .
171         "push \@subs,\$1;" .
172         "push \@subs,\$2;" .
173         "push \@subs,\$3;" .
174         "push \@subs,\$4;" .
175         "push \@subs,\$5;" .
176         "push \@subs,\$6;" .
177         "push \@subs,\$7;" .
178         "push \@subs,\$8;" .
179         "push \@subs,\$9;" .
180         "push \@subs,\$10;" .
181         "push \@subs,\$11;" .
182         "push \@subs,\$12;" .
183         "push \@subs,\$13;" .
184         "push \@subs,\$14;" .
185         "push \@subs,\$15;" .
186         "push \@subs,\$16;" .
187         "push \@subs,\$'; }";
188
189    undef $REGERROR;
190    undef $REGMARK;
191
192    eval "${cmd} (\$x =~ ${pattern}) {" . $pushes;
193
194    if ($@)
195      {
196      printf $outfile "Error: $@\n";
197      next NEXT_RE;
198      }
199    elsif (scalar(@subs) == 0)
200      {
201      printf $outfile "No match";
202      if (defined $REGERROR && $REGERROR != 1)
203        { printf $outfile (", mark = %s", &pchars($REGERROR)); }
204      printf $outfile "\n";
205      }
206    else
207      {
208      while (scalar(@subs) != 0)
209        {
210        printf $outfile (" 0: %s\n", &pchars($subs[0]));
211        printf $outfile (" 0+ %s\n", &pchars($subs[17])) if $showrest;
212        $last_printed = 0;
213        for ($i = 1; $i <= 16; $i++)
214          {
215          if (defined $subs[$i])
216            {
217            while ($last_printed++ < $i-1)
218              { printf $outfile ("%2d: <unset>\n", $last_printed); }
219            printf $outfile ("%2d: %s\n", $i, &pchars($subs[$i]));
220            $last_printed = $i;
221            }
222          }
223        splice(@subs, 0, 18);
224        }
225
226      # It seems that $REGMARK is not marked as UTF-8 even when use utf8 is
227      # set and the input pattern was a UTF-8 string. We can, however, force
228      # it to be so marked.
229
230      if (defined $REGMARK && $REGMARK != 1)
231        {
232        $xx = $REGMARK;
233        $xx = Encode::decode_utf8($xx) if $utf8;
234        printf $outfile ("MK: %s\n", &pchars($xx));
235        }
236      }
237    }
238  }
239
240# printf $outfile "\n";
241
242# End
243