1package TAP::Parser::Grammar;
2
3use strict;
4use warnings;
5
6use TAP::Parser::ResultFactory   ();
7use TAP::Parser::YAMLish::Reader ();
8
9use base 'TAP::Object';
10
11=head1 NAME
12
13TAP::Parser::Grammar - A grammar for the Test Anything Protocol.
14
15=head1 VERSION
16
17Version 3.48
18
19=cut
20
21our $VERSION = '3.48';
22
23=head1 SYNOPSIS
24
25  use TAP::Parser::Grammar;
26  my $grammar = $self->make_grammar({
27    iterator => $tap_parser_iterator,
28    parser   => $tap_parser,
29    version  => 12,
30  });
31
32  my $result = $grammar->tokenize;
33
34=head1 DESCRIPTION
35
36C<TAP::Parser::Grammar> tokenizes lines from a L<TAP::Parser::Iterator> and
37constructs L<TAP::Parser::Result> subclasses to represent the tokens.
38
39Do not attempt to use this class directly.  It won't make sense.  It's mainly
40here to ensure that we will be able to have pluggable grammars when TAP is
41expanded at some future date (plus, this stuff was really cluttering the
42parser).
43
44=head1 METHODS
45
46=head2 Class Methods
47
48=head3 C<new>
49
50  my $grammar = TAP::Parser::Grammar->new({
51      iterator => $iterator,
52      parser   => $parser,
53      version  => $version,
54  });
55
56Returns L<TAP::Parser> grammar object that will parse the TAP stream from the
57specified iterator.  Both C<iterator> and C<parser> are required arguments.
58If C<version> is not set it defaults to C<12> (see L</set_version> for more
59details).
60
61=cut
62
63# new() implementation supplied by TAP::Object
64sub _initialize {
65    my ( $self, $args ) = @_;
66    $self->{iterator} = $args->{iterator};    # TODO: accessor
67    $self->{iterator} ||= $args->{stream};    # deprecated
68    $self->{parser} = $args->{parser};        # TODO: accessor
69    $self->set_version( $args->{version} || 12 );
70    return $self;
71}
72
73my %language_for;
74
75{
76
77    # XXX the 'not' and 'ok' might be on separate lines in VMS ...
78    my $ok  = qr/(?:not )?ok\b/;
79    my $num = qr/\d+/;
80
81    my %v12 = (
82        version => {
83            syntax  => qr/^TAP\s+version\s+(\d+)\s*\z/i,
84            handler => sub {
85                my ( $self, $line ) = @_;
86                my $version = $1;
87                return $self->_make_version_token( $line, $version, );
88            },
89        },
90        plan => {
91            syntax  => qr/^1\.\.(\d+)\s*(.*)\z/,
92            handler => sub {
93                my ( $self, $line ) = @_;
94                my ( $tests_planned, $tail ) = ( $1, $2 );
95                my $explanation = undef;
96                my $skip        = '';
97
98                if ( $tail =~ /^todo((?:\s+\d+)+)/ ) {
99                    my @todo = split /\s+/, _trim($1);
100                    return $self->_make_plan_token(
101                        $line, $tests_planned, 'TODO',
102                        '',    \@todo
103                    );
104                }
105                elsif ( 0 == $tests_planned ) {
106                    $skip = 'SKIP';
107
108                    # If we can't match # SKIP the directive should be undef.
109                    ($explanation) = $tail =~ /^#\s*SKIP\S*\s+(.*)/i;
110                }
111                elsif ( $tail !~ /^\s*$/ ) {
112                    return $self->_make_unknown_token($line);
113                }
114
115                $explanation = '' unless defined $explanation;
116
117                return $self->_make_plan_token(
118                    $line, $tests_planned, $skip,
119                    $explanation, []
120                );
121
122            },
123        },
124
125        # An optimization to handle the most common test lines without
126        # directives.
127        simple_test => {
128            syntax  => qr/^($ok) \ ($num) (?:\ ([^#]+))? \z/x,
129            handler => sub {
130                my ( $self, $line ) = @_;
131                my ( $ok, $num, $desc ) = ( $1, $2, $3 );
132
133                return $self->_make_test_token(
134                    $line, $ok, $num,
135                    $desc
136                );
137            },
138        },
139        test => {
140            syntax  => qr/^($ok) \s* ($num)? \s* (.*) \z/x,
141            handler => sub {
142                my ( $self, $line ) = @_;
143                my ( $ok, $num, $desc ) = ( $1, $2, $3 );
144                my ( $dir, $explanation ) = ( '', '' );
145                if ($desc =~ m/^ ( [^\\\#]* (?: \\. [^\\\#]* )* )
146                       \# \s* (SKIP|TODO) \b \s* (.*) $/ix
147                  )
148                {
149                    ( $desc, $dir, $explanation ) = ( $1, $2, $3 );
150                }
151                return $self->_make_test_token(
152                    $line, $ok, $num, $desc,
153                    $dir,  $explanation
154                );
155            },
156        },
157        comment => {
158            syntax  => qr/^#(.*)/,
159            handler => sub {
160                my ( $self, $line ) = @_;
161                my $comment = $1;
162                return $self->_make_comment_token( $line, $comment );
163            },
164        },
165        bailout => {
166            syntax  => qr/^\s*Bail out!\s*(.*)/,
167            handler => sub {
168                my ( $self, $line ) = @_;
169                my $explanation = $1;
170                return $self->_make_bailout_token(
171                    $line,
172                    $explanation
173                );
174            },
175        },
176    );
177
178    my %v13 = (
179        %v12,
180        plan => {
181            syntax  => qr/^1\.\.(\d+)\s*(?:\s*#\s*SKIP\b(.*))?\z/i,
182            handler => sub {
183                my ( $self, $line ) = @_;
184                my ( $tests_planned, $explanation ) = ( $1, $2 );
185                my $skip
186                  = ( 0 == $tests_planned || defined $explanation )
187                  ? 'SKIP'
188                  : '';
189                $explanation = '' unless defined $explanation;
190                return $self->_make_plan_token(
191                    $line, $tests_planned, $skip,
192                    $explanation, []
193                );
194            },
195        },
196        yaml => {
197            syntax  => qr/^ (\s+) (---.*) $/x,
198            handler => sub {
199                my ( $self, $line ) = @_;
200                my ( $pad, $marker ) = ( $1, $2 );
201                return $self->_make_yaml_token( $pad, $marker );
202            },
203        },
204        pragma => {
205            syntax =>
206              qr/^ pragma \s+ ( [-+] \w+ \s* (?: , \s* [-+] \w+ \s* )* ) $/x,
207            handler => sub {
208                my ( $self, $line ) = @_;
209                my $pragmas = $1;
210                return $self->_make_pragma_token( $line, $pragmas );
211            },
212        },
213    );
214
215    %language_for = (
216        '12' => {
217            tokens => \%v12,
218        },
219        '13' => {
220            tokens => \%v13,
221            setup  => sub {
222                shift->{iterator}->handle_unicode;
223            },
224        },
225        '14' => {
226            tokens => \%v13,
227            setup  => sub {
228                shift->{iterator}->handle_unicode;
229            },
230        },
231    );
232}
233
234##############################################################################
235
236=head2 Instance Methods
237
238=head3 C<set_version>
239
240  $grammar->set_version(13);
241
242Tell the grammar which TAP syntax version to support. The lowest
243supported version is 12. Although 'TAP version' isn't valid version 12
244syntax it is accepted so that higher version numbers may be parsed.
245
246=cut
247
248sub set_version {
249    my $self    = shift;
250    my $version = shift;
251
252    if ( my $language = $language_for{$version} ) {
253        $self->{version} = $version;
254        $self->{tokens}  = $language->{tokens};
255
256        if ( my $setup = $language->{setup} ) {
257            $self->$setup();
258        }
259
260        $self->_order_tokens;
261    }
262    else {
263        require Carp;
264        Carp::croak("Unsupported syntax version: $version");
265    }
266}
267
268# Optimization to put the most frequent tokens first.
269sub _order_tokens {
270    my $self = shift;
271
272    my %copy = %{ $self->{tokens} };
273    my @ordered_tokens = grep {defined}
274      map { delete $copy{$_} } qw( simple_test test comment plan );
275    push @ordered_tokens, values %copy;
276
277    $self->{ordered_tokens} = \@ordered_tokens;
278}
279
280##############################################################################
281
282=head3 C<tokenize>
283
284  my $token = $grammar->tokenize;
285
286This method will return a L<TAP::Parser::Result> object representing the
287current line of TAP.
288
289=cut
290
291sub tokenize {
292    my $self = shift;
293
294    my $line = $self->{iterator}->next;
295    unless ( defined $line ) {
296        delete $self->{parser};    # break circular ref
297        return;
298    }
299
300    my $token;
301
302    for my $token_data ( @{ $self->{ordered_tokens} } ) {
303        if ( $line =~ $token_data->{syntax} ) {
304            my $handler = $token_data->{handler};
305            $token = $self->$handler($line);
306            last;
307        }
308    }
309
310    $token = $self->_make_unknown_token($line) unless $token;
311
312    return $self->{parser}->make_result($token);
313}
314
315##############################################################################
316
317=head3 C<token_types>
318
319  my @types = $grammar->token_types;
320
321Returns the different types of tokens which this grammar can parse.
322
323=cut
324
325sub token_types {
326    my $self = shift;
327    return keys %{ $self->{tokens} };
328}
329
330##############################################################################
331
332=head3 C<syntax_for>
333
334  my $syntax = $grammar->syntax_for($token_type);
335
336Returns a pre-compiled regular expression which will match a chunk of TAP
337corresponding to the token type.  For example (not that you should really pay
338attention to this, C<< $grammar->syntax_for('comment') >> will return
339C<< qr/^#(.*)/ >>.
340
341=cut
342
343sub syntax_for {
344    my ( $self, $type ) = @_;
345    return $self->{tokens}->{$type}->{syntax};
346}
347
348##############################################################################
349
350=head3 C<handler_for>
351
352  my $handler = $grammar->handler_for($token_type);
353
354Returns a code reference which, when passed an appropriate line of TAP,
355returns the lexed token corresponding to that line.  As a result, the basic
356TAP parsing loop looks similar to the following:
357
358 my @tokens;
359 my $grammar = TAP::Grammar->new;
360 LINE: while ( defined( my $line = $parser->_next_chunk_of_tap ) ) {
361     for my $type ( $grammar->token_types ) {
362         my $syntax  = $grammar->syntax_for($type);
363         if ( $line =~ $syntax ) {
364             my $handler = $grammar->handler_for($type);
365             push @tokens => $grammar->$handler($line);
366             next LINE;
367         }
368     }
369     push @tokens => $grammar->_make_unknown_token($line);
370 }
371
372=cut
373
374sub handler_for {
375    my ( $self, $type ) = @_;
376    return $self->{tokens}->{$type}->{handler};
377}
378
379sub _make_version_token {
380    my ( $self, $line, $version ) = @_;
381    return {
382        type    => 'version',
383        raw     => $line,
384        version => $version,
385    };
386}
387
388sub _make_plan_token {
389    my ( $self, $line, $tests_planned, $directive, $explanation, $todo ) = @_;
390
391    if (   $directive eq 'SKIP'
392        && 0 != $tests_planned
393        && $self->{version} < 13 )
394    {
395        warn
396          "Specified SKIP directive in plan but more than 0 tests ($line)\n";
397    }
398
399    return {
400        type          => 'plan',
401        raw           => $line,
402        tests_planned => $tests_planned,
403        directive     => $directive,
404        explanation   => _trim($explanation),
405        todo_list     => $todo,
406    };
407}
408
409sub _make_test_token {
410    my ( $self, $line, $ok, $num, $desc, $dir, $explanation ) = @_;
411    return {
412        ok          => $ok,
413
414        # forcing this to be an integer (and not a string) reduces memory
415        # consumption. RT #84939
416        test_num    => ( defined $num ? 0 + $num : undef ),
417        description => _trim($desc),
418        directive   => ( defined $dir ? uc $dir : '' ),
419        explanation => _trim($explanation),
420        raw         => $line,
421        type        => 'test',
422    };
423}
424
425sub _make_unknown_token {
426    my ( $self, $line ) = @_;
427    return {
428        raw  => $line,
429        type => 'unknown',
430    };
431}
432
433sub _make_comment_token {
434    my ( $self, $line, $comment ) = @_;
435    return {
436        type    => 'comment',
437        raw     => $line,
438        comment => _trim($comment)
439    };
440}
441
442sub _make_bailout_token {
443    my ( $self, $line, $explanation ) = @_;
444    return {
445        type    => 'bailout',
446        raw     => $line,
447        bailout => _trim($explanation)
448    };
449}
450
451sub _make_yaml_token {
452    my ( $self, $pad, $marker ) = @_;
453
454    my $yaml = TAP::Parser::YAMLish::Reader->new;
455
456    my $iterator = $self->{iterator};
457
458    # Construct a reader that reads from our input stripping leading
459    # spaces from each line.
460    my $leader = length($pad);
461    my $strip  = qr{ ^ (\s{$leader}) (.*) $ }x;
462    my @extra  = ($marker);
463    my $reader = sub {
464        return shift @extra if @extra;
465        my $line = $iterator->next;
466        return $2 if $line =~ $strip;
467        return;
468    };
469
470    my $data = $yaml->read($reader);
471
472    # Reconstitute input. This is convoluted. Maybe we should just
473    # record it on the way in...
474    chomp( my $raw = $yaml->get_raw );
475    $raw =~ s/^/$pad/mg;
476
477    return {
478        type => 'yaml',
479        raw  => $raw,
480        data => $data
481    };
482}
483
484sub _make_pragma_token {
485    my ( $self, $line, $pragmas ) = @_;
486    return {
487        type    => 'pragma',
488        raw     => $line,
489        pragmas => [ split /\s*,\s*/, _trim($pragmas) ],
490    };
491}
492
493sub _trim {
494    my $data = shift;
495
496    return '' unless defined $data;
497
498    $data =~ s/^\s+//;
499    $data =~ s/\s+$//;
500    return $data;
501}
502
5031;
504
505=head1 TAP GRAMMAR
506
507B<NOTE:>  This grammar is slightly out of date.  There's still some discussion
508about it and a new one will be provided when we have things better defined.
509
510The L<TAP::Parser> does not use a formal grammar because TAP is essentially a
511stream-based protocol.  In fact, it's quite legal to have an infinite stream.
512For the same reason that we don't apply regexes to streams, we're not using a
513formal grammar here.  Instead, we parse the TAP in lines.
514
515For purposes for forward compatibility, any result which does not match the
516following grammar is currently referred to as
517L<TAP::Parser::Result::Unknown>.  It is I<not> a parse error.
518
519A formal grammar would look similar to the following:
520
521 (*
522     For the time being, I'm cheating on the EBNF by allowing
523     certain terms to be defined by POSIX character classes by
524     using the following syntax:
525
526       digit ::= [:digit:]
527
528     As far as I am aware, that's not valid EBNF.  Sue me.  I
529     didn't know how to write "char" otherwise (Unicode issues).
530     Suggestions welcome.
531 *)
532
533 tap            ::= version? { comment | unknown } leading_plan lines
534                    |
535                    lines trailing_plan {comment}
536
537 version        ::= 'TAP version ' positiveInteger {positiveInteger} "\n"
538
539 leading_plan   ::= plan skip_directive? "\n"
540
541 trailing_plan  ::= plan "\n"
542
543 plan           ::= '1..' nonNegativeInteger
544
545 lines          ::= line {line}
546
547 line           ::= (comment | test | unknown | bailout ) "\n"
548
549 test           ::= status positiveInteger? description? directive?
550
551 status         ::= 'not '? 'ok '
552
553 description    ::= (character - (digit | '#')) {character - '#'}
554
555 directive      ::= todo_directive | skip_directive
556
557 todo_directive ::= hash_mark 'TODO' ' ' {character}
558
559 skip_directive ::= hash_mark 'SKIP' ' ' {character}
560
561 comment        ::= hash_mark {character}
562
563 hash_mark      ::= '#' {' '}
564
565 bailout        ::= 'Bail out!' {character}
566
567 unknown        ::= { (character - "\n") }
568
569 (* POSIX character classes and other terminals *)
570
571 digit              ::= [:digit:]
572 character          ::= ([:print:] - "\n")
573 positiveInteger    ::= ( digit - '0' ) {digit}
574 nonNegativeInteger ::= digit {digit}
575
576=head1 SUBCLASSING
577
578Please see L<TAP::Parser/SUBCLASSING> for a subclassing overview.
579
580If you I<really> want to subclass L<TAP::Parser>'s grammar the best thing to
581do is read through the code.  There's no easy way of summarizing it here.
582
583=head1 SEE ALSO
584
585L<TAP::Object>,
586L<TAP::Parser>,
587L<TAP::Parser::Iterator>,
588L<TAP::Parser::Result>,
589
590=cut
591