1package TAP::Parser::Grammar;
2
3use strict;
4use vars qw($VERSION @ISA);
5
6use TAP::Object                  ();
7use TAP::Parser::ResultFactory   ();
8use TAP::Parser::YAMLish::Reader ();
9
10@ISA = qw(TAP::Object);
11
12=head1 NAME
13
14TAP::Parser::Grammar - A grammar for the Test Anything Protocol.
15
16=head1 VERSION
17
18Version 3.26
19
20=cut
21
22$VERSION = '3.26';
23
24=head1 SYNOPSIS
25
26  use TAP::Parser::Grammar;
27  my $grammar = $self->make_grammar({
28    iterator => $tap_parser_iterator,
29    parser   => $tap_parser,
30    version  => 12,
31  });
32
33  my $result = $grammar->tokenize;
34
35=head1 DESCRIPTION
36
37C<TAP::Parser::Grammar> tokenizes lines from a L<TAP::Parser::Iterator> and
38constructs L<TAP::Parser::Result> subclasses to represent the tokens.
39
40Do not attempt to use this class directly.  It won't make sense.  It's mainly
41here to ensure that we will be able to have pluggable grammars when TAP is
42expanded at some future date (plus, this stuff was really cluttering the
43parser).
44
45=head1 METHODS
46
47=head2 Class Methods
48
49=head3 C<new>
50
51  my $grammar = TAP::Parser::Grammar->new({
52      iterator => $iterator,
53      parser   => $parser,
54      version  => $version,
55  });
56
57Returns L<TAP::Parser> grammar object that will parse the TAP stream from the
58specified iterator.  Both C<iterator> and C<parser> are required arguments.
59If C<version> is not set it defaults to C<12> (see L</set_version> for more
60details).
61
62=cut
63
64# new() implementation supplied by TAP::Object
65sub _initialize {
66    my ( $self, $args ) = @_;
67    $self->{iterator} = $args->{iterator};    # TODO: accessor
68    $self->{iterator} ||= $args->{stream};    # deprecated
69    $self->{parser} = $args->{parser};        # TODO: accessor
70    $self->set_version( $args->{version} || 12 );
71    return $self;
72}
73
74my %language_for;
75
76{
77
78    # XXX the 'not' and 'ok' might be on separate lines in VMS ...
79    my $ok  = qr/(?:not )?ok\b/;
80    my $num = qr/\d+/;
81
82    my %v12 = (
83        version => {
84            syntax  => qr/^TAP\s+version\s+(\d+)\s*\z/i,
85            handler => sub {
86                my ( $self, $line ) = @_;
87                my $version = $1;
88                return $self->_make_version_token( $line, $version, );
89            },
90        },
91        plan => {
92            syntax  => qr/^1\.\.(\d+)\s*(.*)\z/,
93            handler => sub {
94                my ( $self, $line ) = @_;
95                my ( $tests_planned, $tail ) = ( $1, $2 );
96                my $explanation = undef;
97                my $skip        = '';
98
99                if ( $tail =~ /^todo((?:\s+\d+)+)/ ) {
100                    my @todo = split /\s+/, _trim($1);
101                    return $self->_make_plan_token(
102                        $line, $tests_planned, 'TODO',
103                        '',    \@todo
104                    );
105                }
106                elsif ( 0 == $tests_planned ) {
107                    $skip = 'SKIP';
108
109                    # If we can't match # SKIP the directive should be undef.
110                    ($explanation) = $tail =~ /^#\s*SKIP\S*\s+(.*)/i;
111                }
112                elsif ( $tail !~ /^\s*$/ ) {
113                    return $self->_make_unknown_token($line);
114                }
115
116                $explanation = '' unless defined $explanation;
117
118                return $self->_make_plan_token(
119                    $line, $tests_planned, $skip,
120                    $explanation, []
121                );
122
123            },
124        },
125
126        # An optimization to handle the most common test lines without
127        # directives.
128        simple_test => {
129            syntax  => qr/^($ok) \ ($num) (?:\ ([^#]+))? \z/x,
130            handler => sub {
131                my ( $self, $line ) = @_;
132                my ( $ok, $num, $desc ) = ( $1, $2, $3 );
133
134                return $self->_make_test_token(
135                    $line, $ok, $num,
136                    $desc
137                );
138            },
139        },
140        test => {
141            syntax  => qr/^($ok) \s* ($num)? \s* (.*) \z/x,
142            handler => sub {
143                my ( $self, $line ) = @_;
144                my ( $ok, $num, $desc ) = ( $1, $2, $3 );
145                my ( $dir, $explanation ) = ( '', '' );
146                if ($desc =~ m/^ ( [^\\\#]* (?: \\. [^\\\#]* )* )
147                       \# \s* (SKIP|TODO) \b \s* (.*) $/ix
148                  )
149                {
150                    ( $desc, $dir, $explanation ) = ( $1, $2, $3 );
151                }
152                return $self->_make_test_token(
153                    $line, $ok, $num, $desc,
154                    $dir,  $explanation
155                );
156            },
157        },
158        comment => {
159            syntax  => qr/^#(.*)/,
160            handler => sub {
161                my ( $self, $line ) = @_;
162                my $comment = $1;
163                return $self->_make_comment_token( $line, $comment );
164            },
165        },
166        bailout => {
167            syntax  => qr/^\s*Bail out!\s*(.*)/,
168            handler => sub {
169                my ( $self, $line ) = @_;
170                my $explanation = $1;
171                return $self->_make_bailout_token(
172                    $line,
173                    $explanation
174                );
175            },
176        },
177    );
178
179    my %v13 = (
180        %v12,
181        plan => {
182            syntax  => qr/^1\.\.(\d+)(?:\s*#\s*SKIP\b(.*))?\z/i,
183            handler => sub {
184                my ( $self, $line ) = @_;
185                my ( $tests_planned, $explanation ) = ( $1, $2 );
186                my $skip
187                  = ( 0 == $tests_planned || defined $explanation )
188                  ? 'SKIP'
189                  : '';
190                $explanation = '' unless defined $explanation;
191                return $self->_make_plan_token(
192                    $line, $tests_planned, $skip,
193                    $explanation, []
194                );
195            },
196        },
197        yaml => {
198            syntax  => qr/^ (\s+) (---.*) $/x,
199            handler => sub {
200                my ( $self, $line ) = @_;
201                my ( $pad, $marker ) = ( $1, $2 );
202                return $self->_make_yaml_token( $pad, $marker );
203            },
204        },
205        pragma => {
206            syntax =>
207              qr/^ pragma \s+ ( [-+] \w+ \s* (?: , \s* [-+] \w+ \s* )* ) $/x,
208            handler => sub {
209                my ( $self, $line ) = @_;
210                my $pragmas = $1;
211                return $self->_make_pragma_token( $line, $pragmas );
212            },
213        },
214    );
215
216    %language_for = (
217        '12' => {
218            tokens => \%v12,
219        },
220        '13' => {
221            tokens => \%v13,
222            setup  => sub {
223                shift->{iterator}->handle_unicode;
224            },
225        },
226    );
227}
228
229##############################################################################
230
231=head2 Instance Methods
232
233=head3 C<set_version>
234
235  $grammar->set_version(13);
236
237Tell the grammar which TAP syntax version to support. The lowest
238supported version is 12. Although 'TAP version' isn't valid version 12
239syntax it is accepted so that higher version numbers may be parsed.
240
241=cut
242
243sub set_version {
244    my $self    = shift;
245    my $version = shift;
246
247    if ( my $language = $language_for{$version} ) {
248        $self->{version} = $version;
249        $self->{tokens}  = $language->{tokens};
250
251        if ( my $setup = $language->{setup} ) {
252            $self->$setup();
253        }
254
255        $self->_order_tokens;
256    }
257    else {
258        require Carp;
259        Carp::croak("Unsupported syntax version: $version");
260    }
261}
262
263# Optimization to put the most frequent tokens first.
264sub _order_tokens {
265    my $self = shift;
266
267    my %copy = %{ $self->{tokens} };
268    my @ordered_tokens = grep {defined}
269      map { delete $copy{$_} } qw( simple_test test comment plan );
270    push @ordered_tokens, values %copy;
271
272    $self->{ordered_tokens} = \@ordered_tokens;
273}
274
275##############################################################################
276
277=head3 C<tokenize>
278
279  my $token = $grammar->tokenize;
280
281This method will return a L<TAP::Parser::Result> object representing the
282current line of TAP.
283
284=cut
285
286sub tokenize {
287    my $self = shift;
288
289    my $line = $self->{iterator}->next;
290    unless ( defined $line ) {
291        delete $self->{parser};    # break circular ref
292        return;
293    }
294
295    my $token;
296
297    for my $token_data ( @{ $self->{ordered_tokens} } ) {
298        if ( $line =~ $token_data->{syntax} ) {
299            my $handler = $token_data->{handler};
300            $token = $self->$handler($line);
301            last;
302        }
303    }
304
305    $token = $self->_make_unknown_token($line) unless $token;
306
307    return $self->{parser}->make_result($token);
308}
309
310##############################################################################
311
312=head3 C<token_types>
313
314  my @types = $grammar->token_types;
315
316Returns the different types of tokens which this grammar can parse.
317
318=cut
319
320sub token_types {
321    my $self = shift;
322    return keys %{ $self->{tokens} };
323}
324
325##############################################################################
326
327=head3 C<syntax_for>
328
329  my $syntax = $grammar->syntax_for($token_type);
330
331Returns a pre-compiled regular expression which will match a chunk of TAP
332corresponding to the token type.  For example (not that you should really pay
333attention to this, C<< $grammar->syntax_for('comment') >> will return
334C<< qr/^#(.*)/ >>.
335
336=cut
337
338sub syntax_for {
339    my ( $self, $type ) = @_;
340    return $self->{tokens}->{$type}->{syntax};
341}
342
343##############################################################################
344
345=head3 C<handler_for>
346
347  my $handler = $grammar->handler_for($token_type);
348
349Returns a code reference which, when passed an appropriate line of TAP,
350returns the lexed token corresponding to that line.  As a result, the basic
351TAP parsing loop looks similar to the following:
352
353 my @tokens;
354 my $grammar = TAP::Grammar->new;
355 LINE: while ( defined( my $line = $parser->_next_chunk_of_tap ) ) {
356     for my $type ( $grammar->token_types ) {
357         my $syntax  = $grammar->syntax_for($type);
358         if ( $line =~ $syntax ) {
359             my $handler = $grammar->handler_for($type);
360             push @tokens => $grammar->$handler($line);
361             next LINE;
362         }
363     }
364     push @tokens => $grammar->_make_unknown_token($line);
365 }
366
367=cut
368
369sub handler_for {
370    my ( $self, $type ) = @_;
371    return $self->{tokens}->{$type}->{handler};
372}
373
374sub _make_version_token {
375    my ( $self, $line, $version ) = @_;
376    return {
377        type    => 'version',
378        raw     => $line,
379        version => $version,
380    };
381}
382
383sub _make_plan_token {
384    my ( $self, $line, $tests_planned, $directive, $explanation, $todo ) = @_;
385
386    if (   $directive eq 'SKIP'
387        && 0 != $tests_planned
388        && $self->{version} < 13 )
389    {
390        warn
391          "Specified SKIP directive in plan but more than 0 tests ($line)\n";
392    }
393
394    return {
395        type          => 'plan',
396        raw           => $line,
397        tests_planned => $tests_planned,
398        directive     => $directive,
399        explanation   => _trim($explanation),
400        todo_list     => $todo,
401    };
402}
403
404sub _make_test_token {
405    my ( $self, $line, $ok, $num, $desc, $dir, $explanation ) = @_;
406    return {
407        ok          => $ok,
408        test_num    => $num,
409        description => _trim($desc),
410        directive   => ( defined $dir ? uc $dir : '' ),
411        explanation => _trim($explanation),
412        raw         => $line,
413        type        => 'test',
414    };
415}
416
417sub _make_unknown_token {
418    my ( $self, $line ) = @_;
419    return {
420        raw  => $line,
421        type => 'unknown',
422    };
423}
424
425sub _make_comment_token {
426    my ( $self, $line, $comment ) = @_;
427    return {
428        type    => 'comment',
429        raw     => $line,
430        comment => _trim($comment)
431    };
432}
433
434sub _make_bailout_token {
435    my ( $self, $line, $explanation ) = @_;
436    return {
437        type    => 'bailout',
438        raw     => $line,
439        bailout => _trim($explanation)
440    };
441}
442
443sub _make_yaml_token {
444    my ( $self, $pad, $marker ) = @_;
445
446    my $yaml = TAP::Parser::YAMLish::Reader->new;
447
448    my $iterator = $self->{iterator};
449
450    # Construct a reader that reads from our input stripping leading
451    # spaces from each line.
452    my $leader = length($pad);
453    my $strip  = qr{ ^ (\s{$leader}) (.*) $ }x;
454    my @extra  = ($marker);
455    my $reader = sub {
456        return shift @extra if @extra;
457        my $line = $iterator->next;
458        return $2 if $line =~ $strip;
459        return;
460    };
461
462    my $data = $yaml->read($reader);
463
464    # Reconstitute input. This is convoluted. Maybe we should just
465    # record it on the way in...
466    chomp( my $raw = $yaml->get_raw );
467    $raw =~ s/^/$pad/mg;
468
469    return {
470        type => 'yaml',
471        raw  => $raw,
472        data => $data
473    };
474}
475
476sub _make_pragma_token {
477    my ( $self, $line, $pragmas ) = @_;
478    return {
479        type    => 'pragma',
480        raw     => $line,
481        pragmas => [ split /\s*,\s*/, _trim($pragmas) ],
482    };
483}
484
485sub _trim {
486    my $data = shift;
487
488    return '' unless defined $data;
489
490    $data =~ s/^\s+//;
491    $data =~ s/\s+$//;
492    return $data;
493}
494
4951;
496
497=head1 TAP GRAMMAR
498
499B<NOTE:>  This grammar is slightly out of date.  There's still some discussion
500about it and a new one will be provided when we have things better defined.
501
502The L<TAP::Parser> does not use a formal grammar because TAP is essentially a
503stream-based protocol.  In fact, it's quite legal to have an infinite stream.
504For the same reason that we don't apply regexes to streams, we're not using a
505formal grammar here.  Instead, we parse the TAP in lines.
506
507For purposes for forward compatibility, any result which does not match the
508following grammar is currently referred to as
509L<TAP::Parser::Result::Unknown>.  It is I<not> a parse error.
510
511A formal grammar would look similar to the following:
512
513 (*
514     For the time being, I'm cheating on the EBNF by allowing
515     certain terms to be defined by POSIX character classes by
516     using the following syntax:
517
518       digit ::= [:digit:]
519
520     As far as I am aware, that's not valid EBNF.  Sue me.  I
521     didn't know how to write "char" otherwise (Unicode issues).
522     Suggestions welcome.
523 *)
524
525 tap            ::= version? { comment | unknown } leading_plan lines
526                    |
527                    lines trailing_plan {comment}
528
529 version        ::= 'TAP version ' positiveInteger {positiveInteger} "\n"
530
531 leading_plan   ::= plan skip_directive? "\n"
532
533 trailing_plan  ::= plan "\n"
534
535 plan           ::= '1..' nonNegativeInteger
536
537 lines          ::= line {line}
538
539 line           ::= (comment | test | unknown | bailout ) "\n"
540
541 test           ::= status positiveInteger? description? directive?
542
543 status         ::= 'not '? 'ok '
544
545 description    ::= (character - (digit | '#')) {character - '#'}
546
547 directive      ::= todo_directive | skip_directive
548
549 todo_directive ::= hash_mark 'TODO' ' ' {character}
550
551 skip_directive ::= hash_mark 'SKIP' ' ' {character}
552
553 comment        ::= hash_mark {character}
554
555 hash_mark      ::= '#' {' '}
556
557 bailout        ::= 'Bail out!' {character}
558
559 unknown        ::= { (character - "\n") }
560
561 (* POSIX character classes and other terminals *)
562
563 digit              ::= [:digit:]
564 character          ::= ([:print:] - "\n")
565 positiveInteger    ::= ( digit - '0' ) {digit}
566 nonNegativeInteger ::= digit {digit}
567
568=head1 SUBCLASSING
569
570Please see L<TAP::Parser/SUBCLASSING> for a subclassing overview.
571
572If you I<really> want to subclass L<TAP::Parser>'s grammar the best thing to
573do is read through the code.  There's no easy way of summarizing it here.
574
575=head1 SEE ALSO
576
577L<TAP::Object>,
578L<TAP::Parser>,
579L<TAP::Parser::Iterator>,
580L<TAP::Parser::Result>,
581
582=cut
583