1package TAP::Parser::Grammar; 2 3use strict; 4use warnings; 5 6use TAP::Parser::ResultFactory (); 7use TAP::Parser::YAMLish::Reader (); 8 9use base 'TAP::Object'; 10 11=head1 NAME 12 13TAP::Parser::Grammar - A grammar for the Test Anything Protocol. 14 15=head1 VERSION 16 17Version 3.48 18 19=cut 20 21our $VERSION = '3.48'; 22 23=head1 SYNOPSIS 24 25 use TAP::Parser::Grammar; 26 my $grammar = $self->make_grammar({ 27 iterator => $tap_parser_iterator, 28 parser => $tap_parser, 29 version => 12, 30 }); 31 32 my $result = $grammar->tokenize; 33 34=head1 DESCRIPTION 35 36C<TAP::Parser::Grammar> tokenizes lines from a L<TAP::Parser::Iterator> and 37constructs L<TAP::Parser::Result> subclasses to represent the tokens. 38 39Do not attempt to use this class directly. It won't make sense. It's mainly 40here to ensure that we will be able to have pluggable grammars when TAP is 41expanded at some future date (plus, this stuff was really cluttering the 42parser). 43 44=head1 METHODS 45 46=head2 Class Methods 47 48=head3 C<new> 49 50 my $grammar = TAP::Parser::Grammar->new({ 51 iterator => $iterator, 52 parser => $parser, 53 version => $version, 54 }); 55 56Returns L<TAP::Parser> grammar object that will parse the TAP stream from the 57specified iterator. Both C<iterator> and C<parser> are required arguments. 58If C<version> is not set it defaults to C<12> (see L</set_version> for more 59details). 60 61=cut 62 63# new() implementation supplied by TAP::Object 64sub _initialize { 65 my ( $self, $args ) = @_; 66 $self->{iterator} = $args->{iterator}; # TODO: accessor 67 $self->{iterator} ||= $args->{stream}; # deprecated 68 $self->{parser} = $args->{parser}; # TODO: accessor 69 $self->set_version( $args->{version} || 12 ); 70 return $self; 71} 72 73my %language_for; 74 75{ 76 77 # XXX the 'not' and 'ok' might be on separate lines in VMS ... 78 my $ok = qr/(?:not )?ok\b/; 79 my $num = qr/\d+/; 80 81 my %v12 = ( 82 version => { 83 syntax => qr/^TAP\s+version\s+(\d+)\s*\z/i, 84 handler => sub { 85 my ( $self, $line ) = @_; 86 my $version = $1; 87 return $self->_make_version_token( $line, $version, ); 88 }, 89 }, 90 plan => { 91 syntax => qr/^1\.\.(\d+)\s*(.*)\z/, 92 handler => sub { 93 my ( $self, $line ) = @_; 94 my ( $tests_planned, $tail ) = ( $1, $2 ); 95 my $explanation = undef; 96 my $skip = ''; 97 98 if ( $tail =~ /^todo((?:\s+\d+)+)/ ) { 99 my @todo = split /\s+/, _trim($1); 100 return $self->_make_plan_token( 101 $line, $tests_planned, 'TODO', 102 '', \@todo 103 ); 104 } 105 elsif ( 0 == $tests_planned ) { 106 $skip = 'SKIP'; 107 108 # If we can't match # SKIP the directive should be undef. 109 ($explanation) = $tail =~ /^#\s*SKIP\S*\s+(.*)/i; 110 } 111 elsif ( $tail !~ /^\s*$/ ) { 112 return $self->_make_unknown_token($line); 113 } 114 115 $explanation = '' unless defined $explanation; 116 117 return $self->_make_plan_token( 118 $line, $tests_planned, $skip, 119 $explanation, [] 120 ); 121 122 }, 123 }, 124 125 # An optimization to handle the most common test lines without 126 # directives. 127 simple_test => { 128 syntax => qr/^($ok) \ ($num) (?:\ ([^#]+))? \z/x, 129 handler => sub { 130 my ( $self, $line ) = @_; 131 my ( $ok, $num, $desc ) = ( $1, $2, $3 ); 132 133 return $self->_make_test_token( 134 $line, $ok, $num, 135 $desc 136 ); 137 }, 138 }, 139 test => { 140 syntax => qr/^($ok) \s* ($num)? \s* (.*) \z/x, 141 handler => sub { 142 my ( $self, $line ) = @_; 143 my ( $ok, $num, $desc ) = ( $1, $2, $3 ); 144 my ( $dir, $explanation ) = ( '', '' ); 145 if ($desc =~ m/^ ( [^\\\#]* (?: \\. [^\\\#]* )* ) 146 \# \s* (SKIP|TODO) \b \s* (.*) $/ix 147 ) 148 { 149 ( $desc, $dir, $explanation ) = ( $1, $2, $3 ); 150 } 151 return $self->_make_test_token( 152 $line, $ok, $num, $desc, 153 $dir, $explanation 154 ); 155 }, 156 }, 157 comment => { 158 syntax => qr/^#(.*)/, 159 handler => sub { 160 my ( $self, $line ) = @_; 161 my $comment = $1; 162 return $self->_make_comment_token( $line, $comment ); 163 }, 164 }, 165 bailout => { 166 syntax => qr/^\s*Bail out!\s*(.*)/, 167 handler => sub { 168 my ( $self, $line ) = @_; 169 my $explanation = $1; 170 return $self->_make_bailout_token( 171 $line, 172 $explanation 173 ); 174 }, 175 }, 176 ); 177 178 my %v13 = ( 179 %v12, 180 plan => { 181 syntax => qr/^1\.\.(\d+)\s*(?:\s*#\s*SKIP\b(.*))?\z/i, 182 handler => sub { 183 my ( $self, $line ) = @_; 184 my ( $tests_planned, $explanation ) = ( $1, $2 ); 185 my $skip 186 = ( 0 == $tests_planned || defined $explanation ) 187 ? 'SKIP' 188 : ''; 189 $explanation = '' unless defined $explanation; 190 return $self->_make_plan_token( 191 $line, $tests_planned, $skip, 192 $explanation, [] 193 ); 194 }, 195 }, 196 yaml => { 197 syntax => qr/^ (\s+) (---.*) $/x, 198 handler => sub { 199 my ( $self, $line ) = @_; 200 my ( $pad, $marker ) = ( $1, $2 ); 201 return $self->_make_yaml_token( $pad, $marker ); 202 }, 203 }, 204 pragma => { 205 syntax => 206 qr/^ pragma \s+ ( [-+] \w+ \s* (?: , \s* [-+] \w+ \s* )* ) $/x, 207 handler => sub { 208 my ( $self, $line ) = @_; 209 my $pragmas = $1; 210 return $self->_make_pragma_token( $line, $pragmas ); 211 }, 212 }, 213 ); 214 215 %language_for = ( 216 '12' => { 217 tokens => \%v12, 218 }, 219 '13' => { 220 tokens => \%v13, 221 setup => sub { 222 shift->{iterator}->handle_unicode; 223 }, 224 }, 225 '14' => { 226 tokens => \%v13, 227 setup => sub { 228 shift->{iterator}->handle_unicode; 229 }, 230 }, 231 ); 232} 233 234############################################################################## 235 236=head2 Instance Methods 237 238=head3 C<set_version> 239 240 $grammar->set_version(13); 241 242Tell the grammar which TAP syntax version to support. The lowest 243supported version is 12. Although 'TAP version' isn't valid version 12 244syntax it is accepted so that higher version numbers may be parsed. 245 246=cut 247 248sub set_version { 249 my $self = shift; 250 my $version = shift; 251 252 if ( my $language = $language_for{$version} ) { 253 $self->{version} = $version; 254 $self->{tokens} = $language->{tokens}; 255 256 if ( my $setup = $language->{setup} ) { 257 $self->$setup(); 258 } 259 260 $self->_order_tokens; 261 } 262 else { 263 require Carp; 264 Carp::croak("Unsupported syntax version: $version"); 265 } 266} 267 268# Optimization to put the most frequent tokens first. 269sub _order_tokens { 270 my $self = shift; 271 272 my %copy = %{ $self->{tokens} }; 273 my @ordered_tokens = grep {defined} 274 map { delete $copy{$_} } qw( simple_test test comment plan ); 275 push @ordered_tokens, values %copy; 276 277 $self->{ordered_tokens} = \@ordered_tokens; 278} 279 280############################################################################## 281 282=head3 C<tokenize> 283 284 my $token = $grammar->tokenize; 285 286This method will return a L<TAP::Parser::Result> object representing the 287current line of TAP. 288 289=cut 290 291sub tokenize { 292 my $self = shift; 293 294 my $line = $self->{iterator}->next; 295 unless ( defined $line ) { 296 delete $self->{parser}; # break circular ref 297 return; 298 } 299 300 my $token; 301 302 for my $token_data ( @{ $self->{ordered_tokens} } ) { 303 if ( $line =~ $token_data->{syntax} ) { 304 my $handler = $token_data->{handler}; 305 $token = $self->$handler($line); 306 last; 307 } 308 } 309 310 $token = $self->_make_unknown_token($line) unless $token; 311 312 return $self->{parser}->make_result($token); 313} 314 315############################################################################## 316 317=head3 C<token_types> 318 319 my @types = $grammar->token_types; 320 321Returns the different types of tokens which this grammar can parse. 322 323=cut 324 325sub token_types { 326 my $self = shift; 327 return keys %{ $self->{tokens} }; 328} 329 330############################################################################## 331 332=head3 C<syntax_for> 333 334 my $syntax = $grammar->syntax_for($token_type); 335 336Returns a pre-compiled regular expression which will match a chunk of TAP 337corresponding to the token type. For example (not that you should really pay 338attention to this, C<< $grammar->syntax_for('comment') >> will return 339C<< qr/^#(.*)/ >>. 340 341=cut 342 343sub syntax_for { 344 my ( $self, $type ) = @_; 345 return $self->{tokens}->{$type}->{syntax}; 346} 347 348############################################################################## 349 350=head3 C<handler_for> 351 352 my $handler = $grammar->handler_for($token_type); 353 354Returns a code reference which, when passed an appropriate line of TAP, 355returns the lexed token corresponding to that line. As a result, the basic 356TAP parsing loop looks similar to the following: 357 358 my @tokens; 359 my $grammar = TAP::Grammar->new; 360 LINE: while ( defined( my $line = $parser->_next_chunk_of_tap ) ) { 361 for my $type ( $grammar->token_types ) { 362 my $syntax = $grammar->syntax_for($type); 363 if ( $line =~ $syntax ) { 364 my $handler = $grammar->handler_for($type); 365 push @tokens => $grammar->$handler($line); 366 next LINE; 367 } 368 } 369 push @tokens => $grammar->_make_unknown_token($line); 370 } 371 372=cut 373 374sub handler_for { 375 my ( $self, $type ) = @_; 376 return $self->{tokens}->{$type}->{handler}; 377} 378 379sub _make_version_token { 380 my ( $self, $line, $version ) = @_; 381 return { 382 type => 'version', 383 raw => $line, 384 version => $version, 385 }; 386} 387 388sub _make_plan_token { 389 my ( $self, $line, $tests_planned, $directive, $explanation, $todo ) = @_; 390 391 if ( $directive eq 'SKIP' 392 && 0 != $tests_planned 393 && $self->{version} < 13 ) 394 { 395 warn 396 "Specified SKIP directive in plan but more than 0 tests ($line)\n"; 397 } 398 399 return { 400 type => 'plan', 401 raw => $line, 402 tests_planned => $tests_planned, 403 directive => $directive, 404 explanation => _trim($explanation), 405 todo_list => $todo, 406 }; 407} 408 409sub _make_test_token { 410 my ( $self, $line, $ok, $num, $desc, $dir, $explanation ) = @_; 411 return { 412 ok => $ok, 413 414 # forcing this to be an integer (and not a string) reduces memory 415 # consumption. RT #84939 416 test_num => ( defined $num ? 0 + $num : undef ), 417 description => _trim($desc), 418 directive => ( defined $dir ? uc $dir : '' ), 419 explanation => _trim($explanation), 420 raw => $line, 421 type => 'test', 422 }; 423} 424 425sub _make_unknown_token { 426 my ( $self, $line ) = @_; 427 return { 428 raw => $line, 429 type => 'unknown', 430 }; 431} 432 433sub _make_comment_token { 434 my ( $self, $line, $comment ) = @_; 435 return { 436 type => 'comment', 437 raw => $line, 438 comment => _trim($comment) 439 }; 440} 441 442sub _make_bailout_token { 443 my ( $self, $line, $explanation ) = @_; 444 return { 445 type => 'bailout', 446 raw => $line, 447 bailout => _trim($explanation) 448 }; 449} 450 451sub _make_yaml_token { 452 my ( $self, $pad, $marker ) = @_; 453 454 my $yaml = TAP::Parser::YAMLish::Reader->new; 455 456 my $iterator = $self->{iterator}; 457 458 # Construct a reader that reads from our input stripping leading 459 # spaces from each line. 460 my $leader = length($pad); 461 my $strip = qr{ ^ (\s{$leader}) (.*) $ }x; 462 my @extra = ($marker); 463 my $reader = sub { 464 return shift @extra if @extra; 465 my $line = $iterator->next; 466 return $2 if $line =~ $strip; 467 return; 468 }; 469 470 my $data = $yaml->read($reader); 471 472 # Reconstitute input. This is convoluted. Maybe we should just 473 # record it on the way in... 474 chomp( my $raw = $yaml->get_raw ); 475 $raw =~ s/^/$pad/mg; 476 477 return { 478 type => 'yaml', 479 raw => $raw, 480 data => $data 481 }; 482} 483 484sub _make_pragma_token { 485 my ( $self, $line, $pragmas ) = @_; 486 return { 487 type => 'pragma', 488 raw => $line, 489 pragmas => [ split /\s*,\s*/, _trim($pragmas) ], 490 }; 491} 492 493sub _trim { 494 my $data = shift; 495 496 return '' unless defined $data; 497 498 $data =~ s/^\s+//; 499 $data =~ s/\s+$//; 500 return $data; 501} 502 5031; 504 505=head1 TAP GRAMMAR 506 507B<NOTE:> This grammar is slightly out of date. There's still some discussion 508about it and a new one will be provided when we have things better defined. 509 510The L<TAP::Parser> does not use a formal grammar because TAP is essentially a 511stream-based protocol. In fact, it's quite legal to have an infinite stream. 512For the same reason that we don't apply regexes to streams, we're not using a 513formal grammar here. Instead, we parse the TAP in lines. 514 515For purposes for forward compatibility, any result which does not match the 516following grammar is currently referred to as 517L<TAP::Parser::Result::Unknown>. It is I<not> a parse error. 518 519A formal grammar would look similar to the following: 520 521 (* 522 For the time being, I'm cheating on the EBNF by allowing 523 certain terms to be defined by POSIX character classes by 524 using the following syntax: 525 526 digit ::= [:digit:] 527 528 As far as I am aware, that's not valid EBNF. Sue me. I 529 didn't know how to write "char" otherwise (Unicode issues). 530 Suggestions welcome. 531 *) 532 533 tap ::= version? { comment | unknown } leading_plan lines 534 | 535 lines trailing_plan {comment} 536 537 version ::= 'TAP version ' positiveInteger {positiveInteger} "\n" 538 539 leading_plan ::= plan skip_directive? "\n" 540 541 trailing_plan ::= plan "\n" 542 543 plan ::= '1..' nonNegativeInteger 544 545 lines ::= line {line} 546 547 line ::= (comment | test | unknown | bailout ) "\n" 548 549 test ::= status positiveInteger? description? directive? 550 551 status ::= 'not '? 'ok ' 552 553 description ::= (character - (digit | '#')) {character - '#'} 554 555 directive ::= todo_directive | skip_directive 556 557 todo_directive ::= hash_mark 'TODO' ' ' {character} 558 559 skip_directive ::= hash_mark 'SKIP' ' ' {character} 560 561 comment ::= hash_mark {character} 562 563 hash_mark ::= '#' {' '} 564 565 bailout ::= 'Bail out!' {character} 566 567 unknown ::= { (character - "\n") } 568 569 (* POSIX character classes and other terminals *) 570 571 digit ::= [:digit:] 572 character ::= ([:print:] - "\n") 573 positiveInteger ::= ( digit - '0' ) {digit} 574 nonNegativeInteger ::= digit {digit} 575 576=head1 SUBCLASSING 577 578Please see L<TAP::Parser/SUBCLASSING> for a subclassing overview. 579 580If you I<really> want to subclass L<TAP::Parser>'s grammar the best thing to 581do is read through the code. There's no easy way of summarizing it here. 582 583=head1 SEE ALSO 584 585L<TAP::Object>, 586L<TAP::Parser>, 587L<TAP::Parser::Iterator>, 588L<TAP::Parser::Result>, 589 590=cut 591