1package TAP::Parser::Grammar; 2 3use strict; 4use warnings; 5 6use TAP::Parser::ResultFactory (); 7use TAP::Parser::YAMLish::Reader (); 8 9use base 'TAP::Object'; 10 11=head1 NAME 12 13TAP::Parser::Grammar - A grammar for the Test Anything Protocol. 14 15=head1 VERSION 16 17Version 3.44 18 19=cut 20 21our $VERSION = '3.44'; 22 23=head1 SYNOPSIS 24 25 use TAP::Parser::Grammar; 26 my $grammar = $self->make_grammar({ 27 iterator => $tap_parser_iterator, 28 parser => $tap_parser, 29 version => 12, 30 }); 31 32 my $result = $grammar->tokenize; 33 34=head1 DESCRIPTION 35 36C<TAP::Parser::Grammar> tokenizes lines from a L<TAP::Parser::Iterator> and 37constructs L<TAP::Parser::Result> subclasses to represent the tokens. 38 39Do not attempt to use this class directly. It won't make sense. It's mainly 40here to ensure that we will be able to have pluggable grammars when TAP is 41expanded at some future date (plus, this stuff was really cluttering the 42parser). 43 44=head1 METHODS 45 46=head2 Class Methods 47 48=head3 C<new> 49 50 my $grammar = TAP::Parser::Grammar->new({ 51 iterator => $iterator, 52 parser => $parser, 53 version => $version, 54 }); 55 56Returns L<TAP::Parser> grammar object that will parse the TAP stream from the 57specified iterator. Both C<iterator> and C<parser> are required arguments. 58If C<version> is not set it defaults to C<12> (see L</set_version> for more 59details). 60 61=cut 62 63# new() implementation supplied by TAP::Object 64sub _initialize { 65 my ( $self, $args ) = @_; 66 $self->{iterator} = $args->{iterator}; # TODO: accessor 67 $self->{iterator} ||= $args->{stream}; # deprecated 68 $self->{parser} = $args->{parser}; # TODO: accessor 69 $self->set_version( $args->{version} || 12 ); 70 return $self; 71} 72 73my %language_for; 74 75{ 76 77 # XXX the 'not' and 'ok' might be on separate lines in VMS ... 78 my $ok = qr/(?:not )?ok\b/; 79 my $num = qr/\d+/; 80 81 my %v12 = ( 82 version => { 83 syntax => qr/^TAP\s+version\s+(\d+)\s*\z/i, 84 handler => sub { 85 my ( $self, $line ) = @_; 86 my $version = $1; 87 return $self->_make_version_token( $line, $version, ); 88 }, 89 }, 90 plan => { 91 syntax => qr/^1\.\.(\d+)\s*(.*)\z/, 92 handler => sub { 93 my ( $self, $line ) = @_; 94 my ( $tests_planned, $tail ) = ( $1, $2 ); 95 my $explanation = undef; 96 my $skip = ''; 97 98 if ( $tail =~ /^todo((?:\s+\d+)+)/ ) { 99 my @todo = split /\s+/, _trim($1); 100 return $self->_make_plan_token( 101 $line, $tests_planned, 'TODO', 102 '', \@todo 103 ); 104 } 105 elsif ( 0 == $tests_planned ) { 106 $skip = 'SKIP'; 107 108 # If we can't match # SKIP the directive should be undef. 109 ($explanation) = $tail =~ /^#\s*SKIP\S*\s+(.*)/i; 110 } 111 elsif ( $tail !~ /^\s*$/ ) { 112 return $self->_make_unknown_token($line); 113 } 114 115 $explanation = '' unless defined $explanation; 116 117 return $self->_make_plan_token( 118 $line, $tests_planned, $skip, 119 $explanation, [] 120 ); 121 122 }, 123 }, 124 125 # An optimization to handle the most common test lines without 126 # directives. 127 simple_test => { 128 syntax => qr/^($ok) \ ($num) (?:\ ([^#]+))? \z/x, 129 handler => sub { 130 my ( $self, $line ) = @_; 131 my ( $ok, $num, $desc ) = ( $1, $2, $3 ); 132 133 return $self->_make_test_token( 134 $line, $ok, $num, 135 $desc 136 ); 137 }, 138 }, 139 test => { 140 syntax => qr/^($ok) \s* ($num)? \s* (.*) \z/x, 141 handler => sub { 142 my ( $self, $line ) = @_; 143 my ( $ok, $num, $desc ) = ( $1, $2, $3 ); 144 my ( $dir, $explanation ) = ( '', '' ); 145 if ($desc =~ m/^ ( [^\\\#]* (?: \\. [^\\\#]* )* ) 146 \# \s* (SKIP|TODO) \b \s* (.*) $/ix 147 ) 148 { 149 ( $desc, $dir, $explanation ) = ( $1, $2, $3 ); 150 } 151 return $self->_make_test_token( 152 $line, $ok, $num, $desc, 153 $dir, $explanation 154 ); 155 }, 156 }, 157 comment => { 158 syntax => qr/^#(.*)/, 159 handler => sub { 160 my ( $self, $line ) = @_; 161 my $comment = $1; 162 return $self->_make_comment_token( $line, $comment ); 163 }, 164 }, 165 bailout => { 166 syntax => qr/^\s*Bail out!\s*(.*)/, 167 handler => sub { 168 my ( $self, $line ) = @_; 169 my $explanation = $1; 170 return $self->_make_bailout_token( 171 $line, 172 $explanation 173 ); 174 }, 175 }, 176 ); 177 178 my %v13 = ( 179 %v12, 180 plan => { 181 syntax => qr/^1\.\.(\d+)\s*(?:\s*#\s*SKIP\b(.*))?\z/i, 182 handler => sub { 183 my ( $self, $line ) = @_; 184 my ( $tests_planned, $explanation ) = ( $1, $2 ); 185 my $skip 186 = ( 0 == $tests_planned || defined $explanation ) 187 ? 'SKIP' 188 : ''; 189 $explanation = '' unless defined $explanation; 190 return $self->_make_plan_token( 191 $line, $tests_planned, $skip, 192 $explanation, [] 193 ); 194 }, 195 }, 196 yaml => { 197 syntax => qr/^ (\s+) (---.*) $/x, 198 handler => sub { 199 my ( $self, $line ) = @_; 200 my ( $pad, $marker ) = ( $1, $2 ); 201 return $self->_make_yaml_token( $pad, $marker ); 202 }, 203 }, 204 pragma => { 205 syntax => 206 qr/^ pragma \s+ ( [-+] \w+ \s* (?: , \s* [-+] \w+ \s* )* ) $/x, 207 handler => sub { 208 my ( $self, $line ) = @_; 209 my $pragmas = $1; 210 return $self->_make_pragma_token( $line, $pragmas ); 211 }, 212 }, 213 ); 214 215 %language_for = ( 216 '12' => { 217 tokens => \%v12, 218 }, 219 '13' => { 220 tokens => \%v13, 221 setup => sub { 222 shift->{iterator}->handle_unicode; 223 }, 224 }, 225 ); 226} 227 228############################################################################## 229 230=head2 Instance Methods 231 232=head3 C<set_version> 233 234 $grammar->set_version(13); 235 236Tell the grammar which TAP syntax version to support. The lowest 237supported version is 12. Although 'TAP version' isn't valid version 12 238syntax it is accepted so that higher version numbers may be parsed. 239 240=cut 241 242sub set_version { 243 my $self = shift; 244 my $version = shift; 245 246 if ( my $language = $language_for{$version} ) { 247 $self->{version} = $version; 248 $self->{tokens} = $language->{tokens}; 249 250 if ( my $setup = $language->{setup} ) { 251 $self->$setup(); 252 } 253 254 $self->_order_tokens; 255 } 256 else { 257 require Carp; 258 Carp::croak("Unsupported syntax version: $version"); 259 } 260} 261 262# Optimization to put the most frequent tokens first. 263sub _order_tokens { 264 my $self = shift; 265 266 my %copy = %{ $self->{tokens} }; 267 my @ordered_tokens = grep {defined} 268 map { delete $copy{$_} } qw( simple_test test comment plan ); 269 push @ordered_tokens, values %copy; 270 271 $self->{ordered_tokens} = \@ordered_tokens; 272} 273 274############################################################################## 275 276=head3 C<tokenize> 277 278 my $token = $grammar->tokenize; 279 280This method will return a L<TAP::Parser::Result> object representing the 281current line of TAP. 282 283=cut 284 285sub tokenize { 286 my $self = shift; 287 288 my $line = $self->{iterator}->next; 289 unless ( defined $line ) { 290 delete $self->{parser}; # break circular ref 291 return; 292 } 293 294 my $token; 295 296 for my $token_data ( @{ $self->{ordered_tokens} } ) { 297 if ( $line =~ $token_data->{syntax} ) { 298 my $handler = $token_data->{handler}; 299 $token = $self->$handler($line); 300 last; 301 } 302 } 303 304 $token = $self->_make_unknown_token($line) unless $token; 305 306 return $self->{parser}->make_result($token); 307} 308 309############################################################################## 310 311=head3 C<token_types> 312 313 my @types = $grammar->token_types; 314 315Returns the different types of tokens which this grammar can parse. 316 317=cut 318 319sub token_types { 320 my $self = shift; 321 return keys %{ $self->{tokens} }; 322} 323 324############################################################################## 325 326=head3 C<syntax_for> 327 328 my $syntax = $grammar->syntax_for($token_type); 329 330Returns a pre-compiled regular expression which will match a chunk of TAP 331corresponding to the token type. For example (not that you should really pay 332attention to this, C<< $grammar->syntax_for('comment') >> will return 333C<< qr/^#(.*)/ >>. 334 335=cut 336 337sub syntax_for { 338 my ( $self, $type ) = @_; 339 return $self->{tokens}->{$type}->{syntax}; 340} 341 342############################################################################## 343 344=head3 C<handler_for> 345 346 my $handler = $grammar->handler_for($token_type); 347 348Returns a code reference which, when passed an appropriate line of TAP, 349returns the lexed token corresponding to that line. As a result, the basic 350TAP parsing loop looks similar to the following: 351 352 my @tokens; 353 my $grammar = TAP::Grammar->new; 354 LINE: while ( defined( my $line = $parser->_next_chunk_of_tap ) ) { 355 for my $type ( $grammar->token_types ) { 356 my $syntax = $grammar->syntax_for($type); 357 if ( $line =~ $syntax ) { 358 my $handler = $grammar->handler_for($type); 359 push @tokens => $grammar->$handler($line); 360 next LINE; 361 } 362 } 363 push @tokens => $grammar->_make_unknown_token($line); 364 } 365 366=cut 367 368sub handler_for { 369 my ( $self, $type ) = @_; 370 return $self->{tokens}->{$type}->{handler}; 371} 372 373sub _make_version_token { 374 my ( $self, $line, $version ) = @_; 375 return { 376 type => 'version', 377 raw => $line, 378 version => $version, 379 }; 380} 381 382sub _make_plan_token { 383 my ( $self, $line, $tests_planned, $directive, $explanation, $todo ) = @_; 384 385 if ( $directive eq 'SKIP' 386 && 0 != $tests_planned 387 && $self->{version} < 13 ) 388 { 389 warn 390 "Specified SKIP directive in plan but more than 0 tests ($line)\n"; 391 } 392 393 return { 394 type => 'plan', 395 raw => $line, 396 tests_planned => $tests_planned, 397 directive => $directive, 398 explanation => _trim($explanation), 399 todo_list => $todo, 400 }; 401} 402 403sub _make_test_token { 404 my ( $self, $line, $ok, $num, $desc, $dir, $explanation ) = @_; 405 return { 406 ok => $ok, 407 408 # forcing this to be an integer (and not a string) reduces memory 409 # consumption. RT #84939 410 test_num => ( defined $num ? 0 + $num : undef ), 411 description => _trim($desc), 412 directive => ( defined $dir ? uc $dir : '' ), 413 explanation => _trim($explanation), 414 raw => $line, 415 type => 'test', 416 }; 417} 418 419sub _make_unknown_token { 420 my ( $self, $line ) = @_; 421 return { 422 raw => $line, 423 type => 'unknown', 424 }; 425} 426 427sub _make_comment_token { 428 my ( $self, $line, $comment ) = @_; 429 return { 430 type => 'comment', 431 raw => $line, 432 comment => _trim($comment) 433 }; 434} 435 436sub _make_bailout_token { 437 my ( $self, $line, $explanation ) = @_; 438 return { 439 type => 'bailout', 440 raw => $line, 441 bailout => _trim($explanation) 442 }; 443} 444 445sub _make_yaml_token { 446 my ( $self, $pad, $marker ) = @_; 447 448 my $yaml = TAP::Parser::YAMLish::Reader->new; 449 450 my $iterator = $self->{iterator}; 451 452 # Construct a reader that reads from our input stripping leading 453 # spaces from each line. 454 my $leader = length($pad); 455 my $strip = qr{ ^ (\s{$leader}) (.*) $ }x; 456 my @extra = ($marker); 457 my $reader = sub { 458 return shift @extra if @extra; 459 my $line = $iterator->next; 460 return $2 if $line =~ $strip; 461 return; 462 }; 463 464 my $data = $yaml->read($reader); 465 466 # Reconstitute input. This is convoluted. Maybe we should just 467 # record it on the way in... 468 chomp( my $raw = $yaml->get_raw ); 469 $raw =~ s/^/$pad/mg; 470 471 return { 472 type => 'yaml', 473 raw => $raw, 474 data => $data 475 }; 476} 477 478sub _make_pragma_token { 479 my ( $self, $line, $pragmas ) = @_; 480 return { 481 type => 'pragma', 482 raw => $line, 483 pragmas => [ split /\s*,\s*/, _trim($pragmas) ], 484 }; 485} 486 487sub _trim { 488 my $data = shift; 489 490 return '' unless defined $data; 491 492 $data =~ s/^\s+//; 493 $data =~ s/\s+$//; 494 return $data; 495} 496 4971; 498 499=head1 TAP GRAMMAR 500 501B<NOTE:> This grammar is slightly out of date. There's still some discussion 502about it and a new one will be provided when we have things better defined. 503 504The L<TAP::Parser> does not use a formal grammar because TAP is essentially a 505stream-based protocol. In fact, it's quite legal to have an infinite stream. 506For the same reason that we don't apply regexes to streams, we're not using a 507formal grammar here. Instead, we parse the TAP in lines. 508 509For purposes for forward compatibility, any result which does not match the 510following grammar is currently referred to as 511L<TAP::Parser::Result::Unknown>. It is I<not> a parse error. 512 513A formal grammar would look similar to the following: 514 515 (* 516 For the time being, I'm cheating on the EBNF by allowing 517 certain terms to be defined by POSIX character classes by 518 using the following syntax: 519 520 digit ::= [:digit:] 521 522 As far as I am aware, that's not valid EBNF. Sue me. I 523 didn't know how to write "char" otherwise (Unicode issues). 524 Suggestions welcome. 525 *) 526 527 tap ::= version? { comment | unknown } leading_plan lines 528 | 529 lines trailing_plan {comment} 530 531 version ::= 'TAP version ' positiveInteger {positiveInteger} "\n" 532 533 leading_plan ::= plan skip_directive? "\n" 534 535 trailing_plan ::= plan "\n" 536 537 plan ::= '1..' nonNegativeInteger 538 539 lines ::= line {line} 540 541 line ::= (comment | test | unknown | bailout ) "\n" 542 543 test ::= status positiveInteger? description? directive? 544 545 status ::= 'not '? 'ok ' 546 547 description ::= (character - (digit | '#')) {character - '#'} 548 549 directive ::= todo_directive | skip_directive 550 551 todo_directive ::= hash_mark 'TODO' ' ' {character} 552 553 skip_directive ::= hash_mark 'SKIP' ' ' {character} 554 555 comment ::= hash_mark {character} 556 557 hash_mark ::= '#' {' '} 558 559 bailout ::= 'Bail out!' {character} 560 561 unknown ::= { (character - "\n") } 562 563 (* POSIX character classes and other terminals *) 564 565 digit ::= [:digit:] 566 character ::= ([:print:] - "\n") 567 positiveInteger ::= ( digit - '0' ) {digit} 568 nonNegativeInteger ::= digit {digit} 569 570=head1 SUBCLASSING 571 572Please see L<TAP::Parser/SUBCLASSING> for a subclassing overview. 573 574If you I<really> want to subclass L<TAP::Parser>'s grammar the best thing to 575do is read through the code. There's no easy way of summarizing it here. 576 577=head1 SEE ALSO 578 579L<TAP::Object>, 580L<TAP::Parser>, 581L<TAP::Parser::Iterator>, 582L<TAP::Parser::Result>, 583 584=cut 585