1package TAP::Parser::Grammar;
2
3use strict;
4use warnings;
5
6use TAP::Parser::ResultFactory   ();
7use TAP::Parser::YAMLish::Reader ();
8
9use base 'TAP::Object';
10
11=head1 NAME
12
13TAP::Parser::Grammar - A grammar for the Test Anything Protocol.
14
15=head1 VERSION
16
17Version 3.44
18
19=cut
20
21our $VERSION = '3.44';
22
23=head1 SYNOPSIS
24
25  use TAP::Parser::Grammar;
26  my $grammar = $self->make_grammar({
27    iterator => $tap_parser_iterator,
28    parser   => $tap_parser,
29    version  => 12,
30  });
31
32  my $result = $grammar->tokenize;
33
34=head1 DESCRIPTION
35
36C<TAP::Parser::Grammar> tokenizes lines from a L<TAP::Parser::Iterator> and
37constructs L<TAP::Parser::Result> subclasses to represent the tokens.
38
39Do not attempt to use this class directly.  It won't make sense.  It's mainly
40here to ensure that we will be able to have pluggable grammars when TAP is
41expanded at some future date (plus, this stuff was really cluttering the
42parser).
43
44=head1 METHODS
45
46=head2 Class Methods
47
48=head3 C<new>
49
50  my $grammar = TAP::Parser::Grammar->new({
51      iterator => $iterator,
52      parser   => $parser,
53      version  => $version,
54  });
55
56Returns L<TAP::Parser> grammar object that will parse the TAP stream from the
57specified iterator.  Both C<iterator> and C<parser> are required arguments.
58If C<version> is not set it defaults to C<12> (see L</set_version> for more
59details).
60
61=cut
62
63# new() implementation supplied by TAP::Object
64sub _initialize {
65    my ( $self, $args ) = @_;
66    $self->{iterator} = $args->{iterator};    # TODO: accessor
67    $self->{iterator} ||= $args->{stream};    # deprecated
68    $self->{parser} = $args->{parser};        # TODO: accessor
69    $self->set_version( $args->{version} || 12 );
70    return $self;
71}
72
73my %language_for;
74
75{
76
77    # XXX the 'not' and 'ok' might be on separate lines in VMS ...
78    my $ok  = qr/(?:not )?ok\b/;
79    my $num = qr/\d+/;
80
81    my %v12 = (
82        version => {
83            syntax  => qr/^TAP\s+version\s+(\d+)\s*\z/i,
84            handler => sub {
85                my ( $self, $line ) = @_;
86                my $version = $1;
87                return $self->_make_version_token( $line, $version, );
88            },
89        },
90        plan => {
91            syntax  => qr/^1\.\.(\d+)\s*(.*)\z/,
92            handler => sub {
93                my ( $self, $line ) = @_;
94                my ( $tests_planned, $tail ) = ( $1, $2 );
95                my $explanation = undef;
96                my $skip        = '';
97
98                if ( $tail =~ /^todo((?:\s+\d+)+)/ ) {
99                    my @todo = split /\s+/, _trim($1);
100                    return $self->_make_plan_token(
101                        $line, $tests_planned, 'TODO',
102                        '',    \@todo
103                    );
104                }
105                elsif ( 0 == $tests_planned ) {
106                    $skip = 'SKIP';
107
108                    # If we can't match # SKIP the directive should be undef.
109                    ($explanation) = $tail =~ /^#\s*SKIP\S*\s+(.*)/i;
110                }
111                elsif ( $tail !~ /^\s*$/ ) {
112                    return $self->_make_unknown_token($line);
113                }
114
115                $explanation = '' unless defined $explanation;
116
117                return $self->_make_plan_token(
118                    $line, $tests_planned, $skip,
119                    $explanation, []
120                );
121
122            },
123        },
124
125        # An optimization to handle the most common test lines without
126        # directives.
127        simple_test => {
128            syntax  => qr/^($ok) \ ($num) (?:\ ([^#]+))? \z/x,
129            handler => sub {
130                my ( $self, $line ) = @_;
131                my ( $ok, $num, $desc ) = ( $1, $2, $3 );
132
133                return $self->_make_test_token(
134                    $line, $ok, $num,
135                    $desc
136                );
137            },
138        },
139        test => {
140            syntax  => qr/^($ok) \s* ($num)? \s* (.*) \z/x,
141            handler => sub {
142                my ( $self, $line ) = @_;
143                my ( $ok, $num, $desc ) = ( $1, $2, $3 );
144                my ( $dir, $explanation ) = ( '', '' );
145                if ($desc =~ m/^ ( [^\\\#]* (?: \\. [^\\\#]* )* )
146                       \# \s* (SKIP|TODO) \b \s* (.*) $/ix
147                  )
148                {
149                    ( $desc, $dir, $explanation ) = ( $1, $2, $3 );
150                }
151                return $self->_make_test_token(
152                    $line, $ok, $num, $desc,
153                    $dir,  $explanation
154                );
155            },
156        },
157        comment => {
158            syntax  => qr/^#(.*)/,
159            handler => sub {
160                my ( $self, $line ) = @_;
161                my $comment = $1;
162                return $self->_make_comment_token( $line, $comment );
163            },
164        },
165        bailout => {
166            syntax  => qr/^\s*Bail out!\s*(.*)/,
167            handler => sub {
168                my ( $self, $line ) = @_;
169                my $explanation = $1;
170                return $self->_make_bailout_token(
171                    $line,
172                    $explanation
173                );
174            },
175        },
176    );
177
178    my %v13 = (
179        %v12,
180        plan => {
181            syntax  => qr/^1\.\.(\d+)\s*(?:\s*#\s*SKIP\b(.*))?\z/i,
182            handler => sub {
183                my ( $self, $line ) = @_;
184                my ( $tests_planned, $explanation ) = ( $1, $2 );
185                my $skip
186                  = ( 0 == $tests_planned || defined $explanation )
187                  ? 'SKIP'
188                  : '';
189                $explanation = '' unless defined $explanation;
190                return $self->_make_plan_token(
191                    $line, $tests_planned, $skip,
192                    $explanation, []
193                );
194            },
195        },
196        yaml => {
197            syntax  => qr/^ (\s+) (---.*) $/x,
198            handler => sub {
199                my ( $self, $line ) = @_;
200                my ( $pad, $marker ) = ( $1, $2 );
201                return $self->_make_yaml_token( $pad, $marker );
202            },
203        },
204        pragma => {
205            syntax =>
206              qr/^ pragma \s+ ( [-+] \w+ \s* (?: , \s* [-+] \w+ \s* )* ) $/x,
207            handler => sub {
208                my ( $self, $line ) = @_;
209                my $pragmas = $1;
210                return $self->_make_pragma_token( $line, $pragmas );
211            },
212        },
213    );
214
215    %language_for = (
216        '12' => {
217            tokens => \%v12,
218        },
219        '13' => {
220            tokens => \%v13,
221            setup  => sub {
222                shift->{iterator}->handle_unicode;
223            },
224        },
225    );
226}
227
228##############################################################################
229
230=head2 Instance Methods
231
232=head3 C<set_version>
233
234  $grammar->set_version(13);
235
236Tell the grammar which TAP syntax version to support. The lowest
237supported version is 12. Although 'TAP version' isn't valid version 12
238syntax it is accepted so that higher version numbers may be parsed.
239
240=cut
241
242sub set_version {
243    my $self    = shift;
244    my $version = shift;
245
246    if ( my $language = $language_for{$version} ) {
247        $self->{version} = $version;
248        $self->{tokens}  = $language->{tokens};
249
250        if ( my $setup = $language->{setup} ) {
251            $self->$setup();
252        }
253
254        $self->_order_tokens;
255    }
256    else {
257        require Carp;
258        Carp::croak("Unsupported syntax version: $version");
259    }
260}
261
262# Optimization to put the most frequent tokens first.
263sub _order_tokens {
264    my $self = shift;
265
266    my %copy = %{ $self->{tokens} };
267    my @ordered_tokens = grep {defined}
268      map { delete $copy{$_} } qw( simple_test test comment plan );
269    push @ordered_tokens, values %copy;
270
271    $self->{ordered_tokens} = \@ordered_tokens;
272}
273
274##############################################################################
275
276=head3 C<tokenize>
277
278  my $token = $grammar->tokenize;
279
280This method will return a L<TAP::Parser::Result> object representing the
281current line of TAP.
282
283=cut
284
285sub tokenize {
286    my $self = shift;
287
288    my $line = $self->{iterator}->next;
289    unless ( defined $line ) {
290        delete $self->{parser};    # break circular ref
291        return;
292    }
293
294    my $token;
295
296    for my $token_data ( @{ $self->{ordered_tokens} } ) {
297        if ( $line =~ $token_data->{syntax} ) {
298            my $handler = $token_data->{handler};
299            $token = $self->$handler($line);
300            last;
301        }
302    }
303
304    $token = $self->_make_unknown_token($line) unless $token;
305
306    return $self->{parser}->make_result($token);
307}
308
309##############################################################################
310
311=head3 C<token_types>
312
313  my @types = $grammar->token_types;
314
315Returns the different types of tokens which this grammar can parse.
316
317=cut
318
319sub token_types {
320    my $self = shift;
321    return keys %{ $self->{tokens} };
322}
323
324##############################################################################
325
326=head3 C<syntax_for>
327
328  my $syntax = $grammar->syntax_for($token_type);
329
330Returns a pre-compiled regular expression which will match a chunk of TAP
331corresponding to the token type.  For example (not that you should really pay
332attention to this, C<< $grammar->syntax_for('comment') >> will return
333C<< qr/^#(.*)/ >>.
334
335=cut
336
337sub syntax_for {
338    my ( $self, $type ) = @_;
339    return $self->{tokens}->{$type}->{syntax};
340}
341
342##############################################################################
343
344=head3 C<handler_for>
345
346  my $handler = $grammar->handler_for($token_type);
347
348Returns a code reference which, when passed an appropriate line of TAP,
349returns the lexed token corresponding to that line.  As a result, the basic
350TAP parsing loop looks similar to the following:
351
352 my @tokens;
353 my $grammar = TAP::Grammar->new;
354 LINE: while ( defined( my $line = $parser->_next_chunk_of_tap ) ) {
355     for my $type ( $grammar->token_types ) {
356         my $syntax  = $grammar->syntax_for($type);
357         if ( $line =~ $syntax ) {
358             my $handler = $grammar->handler_for($type);
359             push @tokens => $grammar->$handler($line);
360             next LINE;
361         }
362     }
363     push @tokens => $grammar->_make_unknown_token($line);
364 }
365
366=cut
367
368sub handler_for {
369    my ( $self, $type ) = @_;
370    return $self->{tokens}->{$type}->{handler};
371}
372
373sub _make_version_token {
374    my ( $self, $line, $version ) = @_;
375    return {
376        type    => 'version',
377        raw     => $line,
378        version => $version,
379    };
380}
381
382sub _make_plan_token {
383    my ( $self, $line, $tests_planned, $directive, $explanation, $todo ) = @_;
384
385    if (   $directive eq 'SKIP'
386        && 0 != $tests_planned
387        && $self->{version} < 13 )
388    {
389        warn
390          "Specified SKIP directive in plan but more than 0 tests ($line)\n";
391    }
392
393    return {
394        type          => 'plan',
395        raw           => $line,
396        tests_planned => $tests_planned,
397        directive     => $directive,
398        explanation   => _trim($explanation),
399        todo_list     => $todo,
400    };
401}
402
403sub _make_test_token {
404    my ( $self, $line, $ok, $num, $desc, $dir, $explanation ) = @_;
405    return {
406        ok          => $ok,
407
408        # forcing this to be an integer (and not a string) reduces memory
409        # consumption. RT #84939
410        test_num    => ( defined $num ? 0 + $num : undef ),
411        description => _trim($desc),
412        directive   => ( defined $dir ? uc $dir : '' ),
413        explanation => _trim($explanation),
414        raw         => $line,
415        type        => 'test',
416    };
417}
418
419sub _make_unknown_token {
420    my ( $self, $line ) = @_;
421    return {
422        raw  => $line,
423        type => 'unknown',
424    };
425}
426
427sub _make_comment_token {
428    my ( $self, $line, $comment ) = @_;
429    return {
430        type    => 'comment',
431        raw     => $line,
432        comment => _trim($comment)
433    };
434}
435
436sub _make_bailout_token {
437    my ( $self, $line, $explanation ) = @_;
438    return {
439        type    => 'bailout',
440        raw     => $line,
441        bailout => _trim($explanation)
442    };
443}
444
445sub _make_yaml_token {
446    my ( $self, $pad, $marker ) = @_;
447
448    my $yaml = TAP::Parser::YAMLish::Reader->new;
449
450    my $iterator = $self->{iterator};
451
452    # Construct a reader that reads from our input stripping leading
453    # spaces from each line.
454    my $leader = length($pad);
455    my $strip  = qr{ ^ (\s{$leader}) (.*) $ }x;
456    my @extra  = ($marker);
457    my $reader = sub {
458        return shift @extra if @extra;
459        my $line = $iterator->next;
460        return $2 if $line =~ $strip;
461        return;
462    };
463
464    my $data = $yaml->read($reader);
465
466    # Reconstitute input. This is convoluted. Maybe we should just
467    # record it on the way in...
468    chomp( my $raw = $yaml->get_raw );
469    $raw =~ s/^/$pad/mg;
470
471    return {
472        type => 'yaml',
473        raw  => $raw,
474        data => $data
475    };
476}
477
478sub _make_pragma_token {
479    my ( $self, $line, $pragmas ) = @_;
480    return {
481        type    => 'pragma',
482        raw     => $line,
483        pragmas => [ split /\s*,\s*/, _trim($pragmas) ],
484    };
485}
486
487sub _trim {
488    my $data = shift;
489
490    return '' unless defined $data;
491
492    $data =~ s/^\s+//;
493    $data =~ s/\s+$//;
494    return $data;
495}
496
4971;
498
499=head1 TAP GRAMMAR
500
501B<NOTE:>  This grammar is slightly out of date.  There's still some discussion
502about it and a new one will be provided when we have things better defined.
503
504The L<TAP::Parser> does not use a formal grammar because TAP is essentially a
505stream-based protocol.  In fact, it's quite legal to have an infinite stream.
506For the same reason that we don't apply regexes to streams, we're not using a
507formal grammar here.  Instead, we parse the TAP in lines.
508
509For purposes for forward compatibility, any result which does not match the
510following grammar is currently referred to as
511L<TAP::Parser::Result::Unknown>.  It is I<not> a parse error.
512
513A formal grammar would look similar to the following:
514
515 (*
516     For the time being, I'm cheating on the EBNF by allowing
517     certain terms to be defined by POSIX character classes by
518     using the following syntax:
519
520       digit ::= [:digit:]
521
522     As far as I am aware, that's not valid EBNF.  Sue me.  I
523     didn't know how to write "char" otherwise (Unicode issues).
524     Suggestions welcome.
525 *)
526
527 tap            ::= version? { comment | unknown } leading_plan lines
528                    |
529                    lines trailing_plan {comment}
530
531 version        ::= 'TAP version ' positiveInteger {positiveInteger} "\n"
532
533 leading_plan   ::= plan skip_directive? "\n"
534
535 trailing_plan  ::= plan "\n"
536
537 plan           ::= '1..' nonNegativeInteger
538
539 lines          ::= line {line}
540
541 line           ::= (comment | test | unknown | bailout ) "\n"
542
543 test           ::= status positiveInteger? description? directive?
544
545 status         ::= 'not '? 'ok '
546
547 description    ::= (character - (digit | '#')) {character - '#'}
548
549 directive      ::= todo_directive | skip_directive
550
551 todo_directive ::= hash_mark 'TODO' ' ' {character}
552
553 skip_directive ::= hash_mark 'SKIP' ' ' {character}
554
555 comment        ::= hash_mark {character}
556
557 hash_mark      ::= '#' {' '}
558
559 bailout        ::= 'Bail out!' {character}
560
561 unknown        ::= { (character - "\n") }
562
563 (* POSIX character classes and other terminals *)
564
565 digit              ::= [:digit:]
566 character          ::= ([:print:] - "\n")
567 positiveInteger    ::= ( digit - '0' ) {digit}
568 nonNegativeInteger ::= digit {digit}
569
570=head1 SUBCLASSING
571
572Please see L<TAP::Parser/SUBCLASSING> for a subclassing overview.
573
574If you I<really> want to subclass L<TAP::Parser>'s grammar the best thing to
575do is read through the code.  There's no easy way of summarizing it here.
576
577=head1 SEE ALSO
578
579L<TAP::Object>,
580L<TAP::Parser>,
581L<TAP::Parser::Iterator>,
582L<TAP::Parser::Result>,
583
584=cut
585