1package Pod::Simple::BlackBox; 2# 3# "What's in the box?" "Pain." 4# 5########################################################################### 6# 7# This is where all the scary things happen: parsing lines into 8# paragraphs; and then into directives, verbatims, and then also 9# turning formatting sequences into treelets. 10# 11# Are you really sure you want to read this code? 12# 13#----------------------------------------------------------------------------- 14# 15# The basic work of this module Pod::Simple::BlackBox is doing the dirty work 16# of parsing Pod into treelets (generally one per non-verbatim paragraph), and 17# to call the proper callbacks on the treelets. 18# 19# Every node in a treelet is a ['name', {attrhash}, ...children...] 20 21use integer; # vroom! 22use strict; 23use Carp (); 24use vars qw($VERSION ); 25$VERSION = '3.35'; 26#use constant DEBUG => 7; 27BEGIN { 28 require Pod::Simple; 29 *DEBUG = \&Pod::Simple::DEBUG unless defined &DEBUG 30} 31 32# Matches a character iff the character will have a different meaning 33# if we choose CP1252 vs UTF-8 if there is no =encoding line. 34# This is broken for early Perls on non-ASCII platforms. 35my $non_ascii_re = eval "qr/[[:^ascii:]]/"; 36$non_ascii_re = qr/[\x80-\xFF]/ if ! defined $non_ascii_re; 37 38my $utf8_bom; 39if (($] ge 5.007_003)) { 40 $utf8_bom = "\x{FEFF}"; 41 utf8::encode($utf8_bom); 42} else { 43 $utf8_bom = "\xEF\xBB\xBF"; # No EBCDIC BOM detection for early Perls. 44} 45 46#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 47 48sub parse_line { shift->parse_lines(@_) } # alias 49 50# - - - Turn back now! Run away! - - - 51 52sub parse_lines { # Usage: $parser->parse_lines(@lines) 53 # an undef means end-of-stream 54 my $self = shift; 55 56 my $code_handler = $self->{'code_handler'}; 57 my $cut_handler = $self->{'cut_handler'}; 58 my $wl_handler = $self->{'whiteline_handler'}; 59 $self->{'line_count'} ||= 0; 60 61 my $scratch; 62 63 DEBUG > 4 and 64 print STDERR "# Parsing starting at line ", $self->{'line_count'}, ".\n"; 65 66 DEBUG > 5 and 67 print STDERR "# About to parse lines: ", 68 join(' ', map defined($_) ? "[$_]" : "EOF", @_), "\n"; 69 70 my $paras = ($self->{'paras'} ||= []); 71 # paragraph buffer. Because we need to defer processing of =over 72 # directives and verbatim paragraphs. We call _ponder_paragraph_buffer 73 # to process this. 74 75 $self->{'pod_para_count'} ||= 0; 76 77 my $line; 78 foreach my $source_line (@_) { 79 if( $self->{'source_dead'} ) { 80 DEBUG > 4 and print STDERR "# Source is dead.\n"; 81 last; 82 } 83 84 unless( defined $source_line ) { 85 DEBUG > 4 and print STDERR "# Undef-line seen.\n"; 86 87 push @$paras, ['~end', {'start_line' => $self->{'line_count'}}]; 88 push @$paras, $paras->[-1], $paras->[-1]; 89 # So that it definitely fills the buffer. 90 $self->{'source_dead'} = 1; 91 $self->_ponder_paragraph_buffer; 92 next; 93 } 94 95 96 if( $self->{'line_count'}++ ) { 97 ($line = $source_line) =~ tr/\n\r//d; 98 # If we don't have two vars, we'll end up with that there 99 # tr/// modding the (potentially read-only) original source line! 100 101 } else { 102 DEBUG > 2 and print STDERR "First line: [$source_line]\n"; 103 104 if( ($line = $source_line) =~ s/^$utf8_bom//s ) { 105 DEBUG and print STDERR "UTF-8 BOM seen. Faking a '=encoding utf8'.\n"; 106 $self->_handle_encoding_line( "=encoding utf8" ); 107 delete $self->{'_processed_encoding'}; 108 $line =~ tr/\n\r//d; 109 110 } elsif( $line =~ s/^\xFE\xFF//s ) { 111 DEBUG and print STDERR "Big-endian UTF-16 BOM seen. Aborting parsing.\n"; 112 $self->scream( 113 $self->{'line_count'}, 114 "UTF16-BE Byte Encoding Mark found; but Pod::Simple v$Pod::Simple::VERSION doesn't implement UTF16 yet." 115 ); 116 splice @_; 117 push @_, undef; 118 next; 119 120 # TODO: implement somehow? 121 122 } elsif( $line =~ s/^\xFF\xFE//s ) { 123 DEBUG and print STDERR "Little-endian UTF-16 BOM seen. Aborting parsing.\n"; 124 $self->scream( 125 $self->{'line_count'}, 126 "UTF16-LE Byte Encoding Mark found; but Pod::Simple v$Pod::Simple::VERSION doesn't implement UTF16 yet." 127 ); 128 splice @_; 129 push @_, undef; 130 next; 131 132 # TODO: implement somehow? 133 134 } else { 135 DEBUG > 2 and print STDERR "First line is BOM-less.\n"; 136 ($line = $source_line) =~ tr/\n\r//d; 137 } 138 } 139 140 if(!$self->{'parse_characters'} && !$self->{'encoding'} 141 && ($self->{'in_pod'} || $line =~ /^=/s) 142 && $line =~ /$non_ascii_re/ 143 ) { 144 145 my $encoding; 146 147 # No =encoding line, and we are at the first line in the input that 148 # contains a non-ascii byte, that is one whose meaning varies depending 149 # on whether the file is encoded in UTF-8 or CP1252, which are the two 150 # possibilities permitted by the pod spec. (ASCII is assumed if the 151 # file only contains ASCII bytes.) In order to process this line, we 152 # need to figure out what encoding we will use for the file. 153 # 154 # Strictly speaking ISO 8859-1 (Latin 1) refers to the code points 155 # 160-255, but it is used here, as it often colloquially is, to refer to 156 # the complete set of code points 0-255, including ASCII (0-127), the C1 157 # controls (128-159), and strict Latin 1 (160-255). 158 # 159 # CP1252 is effectively a superset of Latin 1, because it differs only 160 # from colloquial 8859-1 in the C1 controls, which are very unlikely to 161 # actually be present in 8859-1 files, so can be used for other purposes 162 # without conflict. CP 1252 uses most of them for graphic characters. 163 # 164 # Note that all ASCII-range bytes represent their corresponding code 165 # points in CP1252 and UTF-8. In ASCII platform UTF-8 all other code 166 # points require multiple (non-ASCII) bytes to represent. (A separate 167 # paragraph for EBCDIC is below.) The multi-byte representation is 168 # quite structured. If we find an isolated byte that requires multiple 169 # bytes to represent in UTF-8, we know that the encoding is not UTF-8. 170 # If we find a sequence of bytes that violates the UTF-8 structure, we 171 # also can presume the encoding isn't UTF-8, and hence must be 1252. 172 # 173 # But there are ambiguous cases where we could guess wrong. If so, the 174 # user will end up having to supply an =encoding line. We use all 175 # readily available information to improve our chances of guessing 176 # right. The odds of something not being UTF-8, but still passing a 177 # UTF-8 validity test go down very rapidly with increasing length of the 178 # sequence. Therefore we look at all the maximal length non-ascii 179 # sequences on the line. If any of the sequences can't be UTF-8, we 180 # quit there and choose CP1252. If all could be UTF-8, we guess UTF-8. 181 # 182 # On EBCDIC platforms, the situation is somewhat different. In 183 # UTF-EBCDIC, not only do ASCII-range bytes represent their code points, 184 # but so do the bytes that are for the C1 controls. Recall that these 185 # correspond to the unused portion of 8859-1 that 1252 mostly takes 186 # over. That means that there are fewer code points that are 187 # represented by multi-bytes. But, note that the these controls are 188 # very unlikely to be in pod text. So if we encounter one of them, it 189 # means that it is quite likely CP1252 and not UTF-8. The net result is 190 # the same code below is used for both platforms. 191 while ($line =~ m/($non_ascii_re+)/g) { 192 my $non_ascii_seq = $1; 193 194 if (length $non_ascii_seq == 1) { 195 $encoding = 'CP1252'; 196 goto guessed; 197 } elsif ($] ge 5.007_003) { 198 199 # On Perls that have this function, we can see if the sequence is 200 # valid UTF-8 or not. 201 my $is_utf8; 202 { 203 no warnings 'utf8'; 204 $is_utf8 = utf8::decode($non_ascii_seq); 205 } 206 if (! $is_utf8) { 207 $encoding = 'CP1252'; 208 goto guessed; 209 } 210 } elsif (ord("A") == 65) { # An early Perl, ASCII platform 211 212 # Without utf8::decode, it's a lot harder to do a rigorous check 213 # (though some early releases had a different function that 214 # accomplished the same thing). Since these are ancient Perls, not 215 # likely to be in use today, we take the easy way out, and look at 216 # just the first two bytes of the sequence to see if they are the 217 # start of a UTF-8 character. In ASCII UTF-8, continuation bytes 218 # must be between 0x80 and 0xBF. Start bytes can range from 0xC2 219 # through 0xFF, but anything above 0xF4 is not Unicode, and hence 220 # extremely unlikely to be in a pod. 221 if ($non_ascii_seq !~ /^[\xC2-\xF4][\x80-\xBF]/) { 222 $encoding = 'CP1252'; 223 goto guessed; 224 } 225 226 # We don't bother doing anything special for EBCDIC on early Perls. 227 # If there is a solitary variant, CP1252 will be chosen; otherwise 228 # UTF-8. 229 } 230 } # End of loop through all variant sequences on the line 231 232 # All sequences in the line could be UTF-8. Guess that. 233 $encoding = 'UTF-8'; 234 235 guessed: 236 $self->_handle_encoding_line( "=encoding $encoding" ); 237 delete $self->{'_processed_encoding'}; 238 $self->{'_transcoder'} && $self->{'_transcoder'}->($line); 239 240 my ($word) = $line =~ /(\S*$non_ascii_re\S*)/; 241 242 $self->whine( 243 $self->{'line_count'}, 244 "Non-ASCII character seen before =encoding in '$word'. Assuming $encoding" 245 ); 246 } 247 248 DEBUG > 5 and print STDERR "# Parsing line: [$line]\n"; 249 250 if(!$self->{'in_pod'}) { 251 if($line =~ m/^=([a-zA-Z][a-zA-Z0-9]*)(?:\s|$)/s) { 252 if($1 eq 'cut') { 253 $self->scream( 254 $self->{'line_count'}, 255 "=cut found outside a pod block. Skipping to next block." 256 ); 257 258 ## Before there were errata sections in the world, it was 259 ## least-pessimal to abort processing the file. But now we can 260 ## just barrel on thru (but still not start a pod block). 261 #splice @_; 262 #push @_, undef; 263 264 next; 265 } else { 266 $self->{'in_pod'} = $self->{'start_of_pod_block'} 267 = $self->{'last_was_blank'} = 1; 268 # And fall thru to the pod-mode block further down 269 } 270 } else { 271 DEBUG > 5 and print STDERR "# It's a code-line.\n"; 272 $code_handler->(map $_, $line, $self->{'line_count'}, $self) 273 if $code_handler; 274 # Note: this may cause code to be processed out of order relative 275 # to pods, but in order relative to cuts. 276 277 # Note also that we haven't yet applied the transcoding to $line 278 # by time we call $code_handler! 279 280 if( $line =~ m/^#\s*line\s+(\d+)\s*(?:\s"([^"]+)")?\s*$/ ) { 281 # That RE is from perlsyn, section "Plain Old Comments (Not!)", 282 #$fname = $2 if defined $2; 283 #DEBUG > 1 and defined $2 and print STDERR "# Setting fname to \"$fname\"\n"; 284 DEBUG > 1 and print STDERR "# Setting nextline to $1\n"; 285 $self->{'line_count'} = $1 - 1; 286 } 287 288 next; 289 } 290 } 291 292 # . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 293 # Else we're in pod mode: 294 295 # Apply any necessary transcoding: 296 $self->{'_transcoder'} && $self->{'_transcoder'}->($line); 297 298 # HERE WE CATCH =encoding EARLY! 299 if( $line =~ m/^=encoding\s+\S+\s*$/s ) { 300 next if $self->parse_characters; # Ignore this line 301 $line = $self->_handle_encoding_line( $line ); 302 } 303 304 if($line =~ m/^=cut/s) { 305 # here ends the pod block, and therefore the previous pod para 306 DEBUG > 1 and print STDERR "Noting =cut at line ${$self}{'line_count'}\n"; 307 $self->{'in_pod'} = 0; 308 # ++$self->{'pod_para_count'}; 309 $self->_ponder_paragraph_buffer(); 310 # by now it's safe to consider the previous paragraph as done. 311 $cut_handler->(map $_, $line, $self->{'line_count'}, $self) 312 if $cut_handler; 313 314 # TODO: add to docs: Note: this may cause cuts to be processed out 315 # of order relative to pods, but in order relative to code. 316 317 } elsif($line =~ m/^(\s*)$/s) { # it's a blank line 318 if (defined $1 and $1 =~ /[^\S\r\n]/) { # it's a white line 319 $wl_handler->(map $_, $line, $self->{'line_count'}, $self) 320 if $wl_handler; 321 } 322 323 if(!$self->{'start_of_pod_block'} and @$paras and $paras->[-1][0] eq '~Verbatim') { 324 DEBUG > 1 and print STDERR "Saving blank line at line ${$self}{'line_count'}\n"; 325 push @{$paras->[-1]}, $line; 326 } # otherwise it's not interesting 327 328 if(!$self->{'start_of_pod_block'} and !$self->{'last_was_blank'}) { 329 DEBUG > 1 and print STDERR "Noting para ends with blank line at ${$self}{'line_count'}\n"; 330 } 331 332 $self->{'last_was_blank'} = 1; 333 334 } elsif($self->{'last_was_blank'}) { # A non-blank line starting a new para... 335 336 if($line =~ m/^(=[a-zA-Z][a-zA-Z0-9]*)(?:\s+|$)(.*)/s) { 337 # THIS IS THE ONE PLACE WHERE WE CONSTRUCT NEW DIRECTIVE OBJECTS 338 my $new = [$1, {'start_line' => $self->{'line_count'}}, $2]; 339 # Note that in "=head1 foo", the WS is lost. 340 # Example: ['=head1', {'start_line' => 123}, ' foo'] 341 342 ++$self->{'pod_para_count'}; 343 344 $self->_ponder_paragraph_buffer(); 345 # by now it's safe to consider the previous paragraph as done. 346 347 push @$paras, $new; # the new incipient paragraph 348 DEBUG > 1 and print STDERR "Starting new ${$paras}[-1][0] para at line ${$self}{'line_count'}\n"; 349 350 } elsif($line =~ m/^\s/s) { 351 352 if(!$self->{'start_of_pod_block'} and @$paras and $paras->[-1][0] eq '~Verbatim') { 353 DEBUG > 1 and print STDERR "Resuming verbatim para at line ${$self}{'line_count'}\n"; 354 push @{$paras->[-1]}, $line; 355 } else { 356 ++$self->{'pod_para_count'}; 357 $self->_ponder_paragraph_buffer(); 358 # by now it's safe to consider the previous paragraph as done. 359 DEBUG > 1 and print STDERR "Starting verbatim para at line ${$self}{'line_count'}\n"; 360 push @$paras, ['~Verbatim', {'start_line' => $self->{'line_count'}}, $line]; 361 } 362 } else { 363 ++$self->{'pod_para_count'}; 364 $self->_ponder_paragraph_buffer(); 365 # by now it's safe to consider the previous paragraph as done. 366 push @$paras, ['~Para', {'start_line' => $self->{'line_count'}}, $line]; 367 DEBUG > 1 and print STDERR "Starting plain para at line ${$self}{'line_count'}\n"; 368 } 369 $self->{'last_was_blank'} = $self->{'start_of_pod_block'} = 0; 370 371 } else { 372 # It's a non-blank line /continuing/ the current para 373 if(@$paras) { 374 DEBUG > 2 and print STDERR "Line ${$self}{'line_count'} continues current paragraph\n"; 375 push @{$paras->[-1]}, $line; 376 } else { 377 # Unexpected case! 378 die "Continuing a paragraph but \@\$paras is empty?"; 379 } 380 $self->{'last_was_blank'} = $self->{'start_of_pod_block'} = 0; 381 } 382 383 } # ends the big while loop 384 385 DEBUG > 1 and print STDERR (pretty(@$paras), "\n"); 386 return $self; 387} 388 389#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 390 391sub _handle_encoding_line { 392 my($self, $line) = @_; 393 394 return if $self->parse_characters; 395 396 # The point of this routine is to set $self->{'_transcoder'} as indicated. 397 398 return $line unless $line =~ m/^=encoding\s+(\S+)\s*$/s; 399 DEBUG > 1 and print STDERR "Found an encoding line \"=encoding $1\"\n"; 400 401 my $e = $1; 402 my $orig = $e; 403 push @{ $self->{'encoding_command_reqs'} }, "=encoding $orig"; 404 405 my $enc_error; 406 407 # Cf. perldoc Encode and perldoc Encode::Supported 408 409 require Pod::Simple::Transcode; 410 411 if( $self->{'encoding'} ) { 412 my $norm_current = $self->{'encoding'}; 413 my $norm_e = $e; 414 foreach my $that ($norm_current, $norm_e) { 415 $that = lc($that); 416 $that =~ s/[-_]//g; 417 } 418 if($norm_current eq $norm_e) { 419 DEBUG > 1 and print STDERR "The '=encoding $orig' line is ", 420 "redundant. ($norm_current eq $norm_e). Ignoring.\n"; 421 $enc_error = ''; 422 # But that doesn't necessarily mean that the earlier one went okay 423 } else { 424 $enc_error = "Encoding is already set to " . $self->{'encoding'}; 425 DEBUG > 1 and print STDERR $enc_error; 426 } 427 } elsif ( 428 # OK, let's turn on the encoding 429 do { 430 DEBUG > 1 and print STDERR " Setting encoding to $e\n"; 431 $self->{'encoding'} = $e; 432 1; 433 } 434 and $e eq 'HACKRAW' 435 ) { 436 DEBUG and print STDERR " Putting in HACKRAW (no-op) encoding mode.\n"; 437 438 } elsif( Pod::Simple::Transcode::->encoding_is_available($e) ) { 439 440 die($enc_error = "WHAT? _transcoder is already set?!") 441 if $self->{'_transcoder'}; # should never happen 442 require Pod::Simple::Transcode; 443 $self->{'_transcoder'} = Pod::Simple::Transcode::->make_transcoder($e); 444 eval { 445 my @x = ('', "abc", "123"); 446 $self->{'_transcoder'}->(@x); 447 }; 448 $@ && die( $enc_error = 449 "Really unexpected error setting up encoding $e: $@\nAborting" 450 ); 451 $self->{'detected_encoding'} = $e; 452 453 } else { 454 my @supported = Pod::Simple::Transcode::->all_encodings; 455 456 # Note unsupported, and complain 457 DEBUG and print STDERR " Encoding [$e] is unsupported.", 458 "\nSupporteds: @supported\n"; 459 my $suggestion = ''; 460 461 # Look for a near match: 462 my $norm = lc($e); 463 $norm =~ tr[-_][]d; 464 my $n; 465 foreach my $enc (@supported) { 466 $n = lc($enc); 467 $n =~ tr[-_][]d; 468 next unless $n eq $norm; 469 $suggestion = " (Maybe \"$e\" should be \"$enc\"?)"; 470 last; 471 } 472 my $encmodver = Pod::Simple::Transcode::->encmodver; 473 $enc_error = join '' => 474 "This document probably does not appear as it should, because its ", 475 "\"=encoding $e\" line calls for an unsupported encoding.", 476 $suggestion, " [$encmodver\'s supported encodings are: @supported]" 477 ; 478 479 $self->scream( $self->{'line_count'}, $enc_error ); 480 } 481 push @{ $self->{'encoding_command_statuses'} }, $enc_error; 482 if (defined($self->{'_processed_encoding'})) { 483 # Double declaration. 484 $self->scream( $self->{'line_count'}, 'Cannot have multiple =encoding directives'); 485 } 486 $self->{'_processed_encoding'} = $orig; 487 488 return $line; 489} 490 491# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 492 493sub _handle_encoding_second_level { 494 # By time this is called, the encoding (if well formed) will already 495 # have been acted one. 496 my($self, $para) = @_; 497 my @x = @$para; 498 my $content = join ' ', splice @x, 2; 499 $content =~ s/^\s+//s; 500 $content =~ s/\s+$//s; 501 502 DEBUG > 2 and print STDERR "Ogling encoding directive: =encoding $content\n"; 503 504 if (defined($self->{'_processed_encoding'})) { 505 #if($content ne $self->{'_processed_encoding'}) { 506 # Could it happen? 507 #} 508 delete $self->{'_processed_encoding'}; 509 # It's already been handled. Check for errors. 510 if(! $self->{'encoding_command_statuses'} ) { 511 DEBUG > 2 and print STDERR " CRAZY ERROR: It wasn't really handled?!\n"; 512 } elsif( $self->{'encoding_command_statuses'}[-1] ) { 513 $self->whine( $para->[1]{'start_line'}, 514 sprintf "Couldn't do %s: %s", 515 $self->{'encoding_command_reqs' }[-1], 516 $self->{'encoding_command_statuses'}[-1], 517 ); 518 } else { 519 DEBUG > 2 and print STDERR " (Yup, it was successfully handled already.)\n"; 520 } 521 522 } else { 523 # Otherwise it's a syntax error 524 $self->whine( $para->[1]{'start_line'}, 525 "Invalid =encoding syntax: $content" 526 ); 527 } 528 529 return; 530} 531 532#~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~` 533 534{ 535my $m = -321; # magic line number 536 537sub _gen_errata { 538 my $self = $_[0]; 539 # Return 0 or more fake-o paragraphs explaining the accumulated 540 # errors on this document. 541 542 return() unless $self->{'errata'} and keys %{$self->{'errata'}}; 543 544 my @out; 545 546 foreach my $line (sort {$a <=> $b} keys %{$self->{'errata'}}) { 547 push @out, 548 ['=item', {'start_line' => $m}, "Around line $line:"], 549 map( ['~Para', {'start_line' => $m, '~cooked' => 1}, 550 #['~Top', {'start_line' => $m}, 551 $_ 552 #] 553 ], 554 @{$self->{'errata'}{$line}} 555 ) 556 ; 557 } 558 559 # TODO: report of unknown entities? unrenderable characters? 560 561 unshift @out, 562 ['=head1', {'start_line' => $m, 'errata' => 1}, 'POD ERRORS'], 563 ['~Para', {'start_line' => $m, '~cooked' => 1, 'errata' => 1}, 564 "Hey! ", 565 ['B', {}, 566 'The above document had some coding errors, which are explained below:' 567 ] 568 ], 569 ['=over', {'start_line' => $m, 'errata' => 1}, ''], 570 ; 571 572 push @out, 573 ['=back', {'start_line' => $m, 'errata' => 1}, ''], 574 ; 575 576 DEBUG and print STDERR "\n<<\n", pretty(\@out), "\n>>\n\n"; 577 578 return @out; 579} 580 581} 582 583#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 584 585############################################################################## 586## 587## stop reading now stop reading now stop reading now stop reading now stop 588## 589## HERE IT BECOMES REALLY SCARY 590## 591## stop reading now stop reading now stop reading now stop reading now stop 592## 593############################################################################## 594 595sub _ponder_paragraph_buffer { 596 597 # Para-token types as found in the buffer. 598 # ~Verbatim, ~Para, ~end, =head1..4, =for, =begin, =end, 599 # =over, =back, =item 600 # and the null =pod (to be complained about if over one line) 601 # 602 # "~data" paragraphs are something we generate at this level, depending on 603 # a currently open =over region 604 605 # Events fired: Begin and end for: 606 # directivename (like head1 .. head4), item, extend, 607 # for (from =begin...=end, =for), 608 # over-bullet, over-number, over-text, over-block, 609 # item-bullet, item-number, item-text, 610 # Document, 611 # Data, Para, Verbatim 612 # B, C, longdirname (TODO -- wha?), etc. for all directives 613 # 614 615 my $self = $_[0]; 616 my $paras; 617 return unless @{$paras = $self->{'paras'}}; 618 my $curr_open = ($self->{'curr_open'} ||= []); 619 620 my $scratch; 621 622 DEBUG > 10 and print STDERR "# Paragraph buffer: <<", pretty($paras), ">>\n"; 623 624 # We have something in our buffer. So apparently the document has started. 625 unless($self->{'doc_has_started'}) { 626 $self->{'doc_has_started'} = 1; 627 628 my $starting_contentless; 629 $starting_contentless = 630 ( 631 !@$curr_open 632 and @$paras and ! grep $_->[0] ne '~end', @$paras 633 # i.e., if the paras is all ~ends 634 ) 635 ; 636 DEBUG and print STDERR "# Starting ", 637 $starting_contentless ? 'contentless' : 'contentful', 638 " document\n" 639 ; 640 641 $self->_handle_element_start( 642 ($scratch = 'Document'), 643 { 644 'start_line' => $paras->[0][1]{'start_line'}, 645 $starting_contentless ? ( 'contentless' => 1 ) : (), 646 }, 647 ); 648 } 649 650 my($para, $para_type); 651 while(@$paras) { 652 last if @$paras == 1 and 653 ( $paras->[0][0] eq '=over' or $paras->[0][0] eq '~Verbatim' 654 or $paras->[0][0] eq '=item' ) 655 ; 656 # Those're the three kinds of paragraphs that require lookahead. 657 # Actually, an "=item Foo" inside an <over type=text> region 658 # and any =item inside an <over type=block> region (rare) 659 # don't require any lookahead, but all others (bullets 660 # and numbers) do. 661 662# TODO: whinge about many kinds of directives in non-resolving =for regions? 663# TODO: many? like what? =head1 etc? 664 665 $para = shift @$paras; 666 $para_type = $para->[0]; 667 668 DEBUG > 1 and print STDERR "Pondering a $para_type paragraph, given the stack: (", 669 $self->_dump_curr_open(), ")\n"; 670 671 if($para_type eq '=for') { 672 next if $self->_ponder_for($para,$curr_open,$paras); 673 674 } elsif($para_type eq '=begin') { 675 next if $self->_ponder_begin($para,$curr_open,$paras); 676 677 } elsif($para_type eq '=end') { 678 next if $self->_ponder_end($para,$curr_open,$paras); 679 680 } elsif($para_type eq '~end') { # The virtual end-document signal 681 next if $self->_ponder_doc_end($para,$curr_open,$paras); 682 } 683 684 685 # ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ 686 #~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ 687 if(grep $_->[1]{'~ignore'}, @$curr_open) { 688 DEBUG > 1 and 689 print STDERR "Skipping $para_type paragraph because in ignore mode.\n"; 690 next; 691 } 692 #~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ 693 # ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ 694 695 if($para_type eq '=pod') { 696 $self->_ponder_pod($para,$curr_open,$paras); 697 698 } elsif($para_type eq '=over') { 699 next if $self->_ponder_over($para,$curr_open,$paras); 700 701 } elsif($para_type eq '=back') { 702 next if $self->_ponder_back($para,$curr_open,$paras); 703 704 } else { 705 706 # All non-magical codes!!! 707 708 # Here we start using $para_type for our own twisted purposes, to 709 # mean how it should get treated, not as what the element name 710 # should be. 711 712 DEBUG > 1 and print STDERR "Pondering non-magical $para_type\n"; 713 714 my $i; 715 716 # Enforce some =headN discipline 717 if($para_type =~ m/^=head\d$/s 718 and ! $self->{'accept_heads_anywhere'} 719 and @$curr_open 720 and $curr_open->[-1][0] eq '=over' 721 ) { 722 DEBUG > 2 and print STDERR "'=$para_type' inside an '=over'!\n"; 723 $self->whine( 724 $para->[1]{'start_line'}, 725 "You forgot a '=back' before '$para_type'" 726 ); 727 unshift @$paras, ['=back', {}, ''], $para; # close the =over 728 next; 729 } 730 731 732 if($para_type eq '=item') { 733 734 my $over; 735 unless(@$curr_open and 736 $over = (grep { $_->[0] eq '=over' } @$curr_open)[-1]) { 737 $self->whine( 738 $para->[1]{'start_line'}, 739 "'=item' outside of any '=over'" 740 ); 741 unshift @$paras, 742 ['=over', {'start_line' => $para->[1]{'start_line'}}, ''], 743 $para 744 ; 745 next; 746 } 747 748 749 my $over_type = $over->[1]{'~type'}; 750 751 if(!$over_type) { 752 # Shouldn't happen1 753 die "Typeless over in stack, starting at line " 754 . $over->[1]{'start_line'}; 755 756 } elsif($over_type eq 'block') { 757 unless($curr_open->[-1][1]{'~bitched_about'}) { 758 $curr_open->[-1][1]{'~bitched_about'} = 1; 759 $self->whine( 760 $curr_open->[-1][1]{'start_line'}, 761 "You can't have =items (as at line " 762 . $para->[1]{'start_line'} 763 . ") unless the first thing after the =over is an =item" 764 ); 765 } 766 # Just turn it into a paragraph and reconsider it 767 $para->[0] = '~Para'; 768 unshift @$paras, $para; 769 next; 770 771 } elsif($over_type eq 'text') { 772 my $item_type = $self->_get_item_type($para); 773 # That kills the content of the item if it's a number or bullet. 774 DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n"; 775 776 if($item_type eq 'text') { 777 # Nothing special needs doing for 'text' 778 } elsif($item_type eq 'number' or $item_type eq 'bullet') { 779 $self->whine( 780 $para->[1]{'start_line'}, 781 "Expected text after =item, not a $item_type" 782 ); 783 # Undo our clobbering: 784 push @$para, $para->[1]{'~orig_content'}; 785 delete $para->[1]{'number'}; 786 # Only a PROPER item-number element is allowed 787 # to have a number attribute. 788 } else { 789 die "Unhandled item type $item_type"; # should never happen 790 } 791 792 # =item-text thingies don't need any assimilation, it seems. 793 794 } elsif($over_type eq 'number') { 795 my $item_type = $self->_get_item_type($para); 796 # That kills the content of the item if it's a number or bullet. 797 DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n"; 798 799 my $expected_value = ++ $curr_open->[-1][1]{'~counter'}; 800 801 if($item_type eq 'bullet') { 802 # Hm, it's not numeric. Correct for this. 803 $para->[1]{'number'} = $expected_value; 804 $self->whine( 805 $para->[1]{'start_line'}, 806 "Expected '=item $expected_value'" 807 ); 808 push @$para, $para->[1]{'~orig_content'}; 809 # restore the bullet, blocking the assimilation of next para 810 811 } elsif($item_type eq 'text') { 812 # Hm, it's not numeric. Correct for this. 813 $para->[1]{'number'} = $expected_value; 814 $self->whine( 815 $para->[1]{'start_line'}, 816 "Expected '=item $expected_value'" 817 ); 818 # Text content will still be there and will block next ~Para 819 820 } elsif($item_type ne 'number') { 821 die "Unknown item type $item_type"; # should never happen 822 823 } elsif($expected_value == $para->[1]{'number'}) { 824 DEBUG > 1 and print STDERR " Numeric item has the expected value of $expected_value\n"; 825 826 } else { 827 DEBUG > 1 and print STDERR " Numeric item has ", $para->[1]{'number'}, 828 " instead of the expected value of $expected_value\n"; 829 $self->whine( 830 $para->[1]{'start_line'}, 831 "You have '=item " . $para->[1]{'number'} . 832 "' instead of the expected '=item $expected_value'" 833 ); 834 $para->[1]{'number'} = $expected_value; # correcting!! 835 } 836 837 if(@$para == 2) { 838 # For the cases where we /didn't/ push to @$para 839 if($paras->[0][0] eq '~Para') { 840 DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n"; 841 push @$para, splice @{shift @$paras},2; 842 } else { 843 DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n"; 844 push @$para, ''; # Just so it's not contentless 845 } 846 } 847 848 849 } elsif($over_type eq 'bullet') { 850 my $item_type = $self->_get_item_type($para); 851 # That kills the content of the item if it's a number or bullet. 852 DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n"; 853 854 if($item_type eq 'bullet') { 855 # as expected! 856 857 if( $para->[1]{'~_freaky_para_hack'} ) { 858 DEBUG and print STDERR "Accomodating '=item * Foo' tolerance hack.\n"; 859 push @$para, delete $para->[1]{'~_freaky_para_hack'}; 860 } 861 862 } elsif($item_type eq 'number') { 863 $self->whine( 864 $para->[1]{'start_line'}, 865 "Expected '=item *'" 866 ); 867 push @$para, $para->[1]{'~orig_content'}; 868 # and block assimilation of the next paragraph 869 delete $para->[1]{'number'}; 870 # Only a PROPER item-number element is allowed 871 # to have a number attribute. 872 } elsif($item_type eq 'text') { 873 $self->whine( 874 $para->[1]{'start_line'}, 875 "Expected '=item *'" 876 ); 877 # But doesn't need processing. But it'll block assimilation 878 # of the next para. 879 } else { 880 die "Unhandled item type $item_type"; # should never happen 881 } 882 883 if(@$para == 2) { 884 # For the cases where we /didn't/ push to @$para 885 if($paras->[0][0] eq '~Para') { 886 DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n"; 887 push @$para, splice @{shift @$paras},2; 888 } else { 889 DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n"; 890 push @$para, ''; # Just so it's not contentless 891 } 892 } 893 894 } else { 895 die "Unhandled =over type \"$over_type\"?"; 896 # Shouldn't happen! 897 } 898 899 $para_type = 'Plain'; 900 $para->[0] .= '-' . $over_type; 901 # Whew. Now fall thru and process it. 902 903 904 } elsif($para_type eq '=extend') { 905 # Well, might as well implement it here. 906 $self->_ponder_extend($para); 907 next; # and skip 908 } elsif($para_type eq '=encoding') { 909 # Not actually acted on here, but we catch errors here. 910 $self->_handle_encoding_second_level($para); 911 next unless $self->keep_encoding_directive; 912 $para_type = 'Plain'; 913 } elsif($para_type eq '~Verbatim') { 914 $para->[0] = 'Verbatim'; 915 $para_type = '?Verbatim'; 916 } elsif($para_type eq '~Para') { 917 $para->[0] = 'Para'; 918 $para_type = '?Plain'; 919 } elsif($para_type eq 'Data') { 920 $para->[0] = 'Data'; 921 $para_type = '?Data'; 922 } elsif( $para_type =~ s/^=//s 923 and defined( $para_type = $self->{'accept_directives'}{$para_type} ) 924 ) { 925 DEBUG > 1 and print STDERR " Pondering known directive ${$para}[0] as $para_type\n"; 926 } else { 927 # An unknown directive! 928 DEBUG > 1 and printf STDERR "Unhandled directive %s (Handled: %s)\n", 929 $para->[0], join(' ', sort keys %{$self->{'accept_directives'}} ) 930 ; 931 $self->whine( 932 $para->[1]{'start_line'}, 933 "Unknown directive: $para->[0]" 934 ); 935 936 # And maybe treat it as text instead of just letting it go? 937 next; 938 } 939 940 if($para_type =~ s/^\?//s) { 941 if(! @$curr_open) { # usual case 942 DEBUG and print STDERR "Treating $para_type paragraph as such because stack is empty.\n"; 943 } else { 944 my @fors = grep $_->[0] eq '=for', @$curr_open; 945 DEBUG > 1 and print STDERR "Containing fors: ", 946 join(',', map $_->[1]{'target'}, @fors), "\n"; 947 948 if(! @fors) { 949 DEBUG and print STDERR "Treating $para_type paragraph as such because stack has no =for's\n"; 950 951 #} elsif(grep $_->[1]{'~resolve'}, @fors) { 952 #} elsif(not grep !$_->[1]{'~resolve'}, @fors) { 953 } elsif( $fors[-1][1]{'~resolve'} ) { 954 # Look to the immediately containing for 955 956 if($para_type eq 'Data') { 957 DEBUG and print STDERR "Treating Data paragraph as Plain/Verbatim because the containing =for ($fors[-1][1]{'target'}) is a resolver\n"; 958 $para->[0] = 'Para'; 959 $para_type = 'Plain'; 960 } else { 961 DEBUG and print STDERR "Treating $para_type paragraph as such because the containing =for ($fors[-1][1]{'target'}) is a resolver\n"; 962 } 963 } else { 964 DEBUG and print STDERR "Treating $para_type paragraph as Data because the containing =for ($fors[-1][1]{'target'}) is a non-resolver\n"; 965 $para->[0] = $para_type = 'Data'; 966 } 967 } 968 } 969 970 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 971 if($para_type eq 'Plain') { 972 $self->_ponder_Plain($para); 973 } elsif($para_type eq 'Verbatim') { 974 $self->_ponder_Verbatim($para); 975 } elsif($para_type eq 'Data') { 976 $self->_ponder_Data($para); 977 } else { 978 die "\$para type is $para_type -- how did that happen?"; 979 # Shouldn't happen. 980 } 981 982 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 983 $para->[0] =~ s/^[~=]//s; 984 985 DEBUG and print STDERR "\n", pretty($para), "\n"; 986 987 # traverse the treelet (which might well be just one string scalar) 988 $self->{'content_seen'} ||= 1; 989 $self->_traverse_treelet_bit(@$para); 990 } 991 } 992 993 return; 994} 995 996########################################################################### 997# The sub-ponderers... 998 999 1000 1001sub _ponder_for { 1002 my ($self,$para,$curr_open,$paras) = @_; 1003 1004 # Fake it out as a begin/end 1005 my $target; 1006 1007 if(grep $_->[1]{'~ignore'}, @$curr_open) { 1008 DEBUG > 1 and print STDERR "Ignoring ignorable =for\n"; 1009 return 1; 1010 } 1011 1012 for(my $i = 2; $i < @$para; ++$i) { 1013 if($para->[$i] =~ s/^\s*(\S+)\s*//s) { 1014 $target = $1; 1015 last; 1016 } 1017 } 1018 unless(defined $target) { 1019 $self->whine( 1020 $para->[1]{'start_line'}, 1021 "=for without a target?" 1022 ); 1023 return 1; 1024 } 1025 DEBUG > 1 and 1026 print STDERR "Faking out a =for $target as a =begin $target / =end $target\n"; 1027 1028 $para->[0] = 'Data'; 1029 1030 unshift @$paras, 1031 ['=begin', 1032 {'start_line' => $para->[1]{'start_line'}, '~really' => '=for'}, 1033 $target, 1034 ], 1035 $para, 1036 ['=end', 1037 {'start_line' => $para->[1]{'start_line'}, '~really' => '=for'}, 1038 $target, 1039 ], 1040 ; 1041 1042 return 1; 1043} 1044 1045sub _ponder_begin { 1046 my ($self,$para,$curr_open,$paras) = @_; 1047 my $content = join ' ', splice @$para, 2; 1048 $content =~ s/^\s+//s; 1049 $content =~ s/\s+$//s; 1050 unless(length($content)) { 1051 $self->whine( 1052 $para->[1]{'start_line'}, 1053 "=begin without a target?" 1054 ); 1055 DEBUG and print STDERR "Ignoring targetless =begin\n"; 1056 return 1; 1057 } 1058 1059 my ($target, $title) = $content =~ m/^(\S+)\s*(.*)$/; 1060 $para->[1]{'title'} = $title if ($title); 1061 $para->[1]{'target'} = $target; # without any ':' 1062 $content = $target; # strip off the title 1063 1064 $content =~ s/^:!/!:/s; 1065 my $neg; # whether this is a negation-match 1066 $neg = 1 if $content =~ s/^!//s; 1067 my $to_resolve; # whether to process formatting codes 1068 $to_resolve = 1 if $content =~ s/^://s; 1069 1070 my $dont_ignore; # whether this target matches us 1071 1072 foreach my $target_name ( 1073 split(',', $content, -1), 1074 $neg ? () : '*' 1075 ) { 1076 DEBUG > 2 and 1077 print STDERR " Considering whether =begin $content matches $target_name\n"; 1078 next unless $self->{'accept_targets'}{$target_name}; 1079 1080 DEBUG > 2 and 1081 print STDERR " It DOES match the acceptable target $target_name!\n"; 1082 $to_resolve = 1 1083 if $self->{'accept_targets'}{$target_name} eq 'force_resolve'; 1084 $dont_ignore = 1; 1085 $para->[1]{'target_matching'} = $target_name; 1086 last; # stop looking at other target names 1087 } 1088 1089 if($neg) { 1090 if( $dont_ignore ) { 1091 $dont_ignore = ''; 1092 delete $para->[1]{'target_matching'}; 1093 DEBUG > 2 and print STDERR " But the leading ! means that this is a NON-match!\n"; 1094 } else { 1095 $dont_ignore = 1; 1096 $para->[1]{'target_matching'} = '!'; 1097 DEBUG > 2 and print STDERR " But the leading ! means that this IS a match!\n"; 1098 } 1099 } 1100 1101 $para->[0] = '=for'; # Just what we happen to call these, internally 1102 $para->[1]{'~really'} ||= '=begin'; 1103 $para->[1]{'~ignore'} = (! $dont_ignore) || 0; 1104 $para->[1]{'~resolve'} = $to_resolve || 0; 1105 1106 DEBUG > 1 and print STDERR " Making note to ", $dont_ignore ? 'not ' : '', 1107 "ignore contents of this region\n"; 1108 DEBUG > 1 and $dont_ignore and print STDERR " Making note to treat contents as ", 1109 ($to_resolve ? 'verbatim/plain' : 'data'), " paragraphs\n"; 1110 DEBUG > 1 and print STDERR " (Stack now: ", $self->_dump_curr_open(), ")\n"; 1111 1112 push @$curr_open, $para; 1113 if(!$dont_ignore or scalar grep $_->[1]{'~ignore'}, @$curr_open) { 1114 DEBUG > 1 and print STDERR "Ignoring ignorable =begin\n"; 1115 } else { 1116 $self->{'content_seen'} ||= 1; 1117 $self->_handle_element_start((my $scratch='for'), $para->[1]); 1118 } 1119 1120 return 1; 1121} 1122 1123sub _ponder_end { 1124 my ($self,$para,$curr_open,$paras) = @_; 1125 my $content = join ' ', splice @$para, 2; 1126 $content =~ s/^\s+//s; 1127 $content =~ s/\s+$//s; 1128 DEBUG and print STDERR "Ogling '=end $content' directive\n"; 1129 1130 unless(length($content)) { 1131 $self->whine( 1132 $para->[1]{'start_line'}, 1133 "'=end' without a target?" . ( 1134 ( @$curr_open and $curr_open->[-1][0] eq '=for' ) 1135 ? ( " (Should be \"=end " . $curr_open->[-1][1]{'target'} . '")' ) 1136 : '' 1137 ) 1138 ); 1139 DEBUG and print STDERR "Ignoring targetless =end\n"; 1140 return 1; 1141 } 1142 1143 unless($content =~ m/^\S+$/) { # i.e., unless it's one word 1144 $self->whine( 1145 $para->[1]{'start_line'}, 1146 "'=end $content' is invalid. (Stack: " 1147 . $self->_dump_curr_open() . ')' 1148 ); 1149 DEBUG and print STDERR "Ignoring mistargetted =end $content\n"; 1150 return 1; 1151 } 1152 1153 unless(@$curr_open and $curr_open->[-1][0] eq '=for') { 1154 $self->whine( 1155 $para->[1]{'start_line'}, 1156 "=end $content without matching =begin. (Stack: " 1157 . $self->_dump_curr_open() . ')' 1158 ); 1159 DEBUG and print STDERR "Ignoring mistargetted =end $content\n"; 1160 return 1; 1161 } 1162 1163 unless($content eq $curr_open->[-1][1]{'target'}) { 1164 $self->whine( 1165 $para->[1]{'start_line'}, 1166 "=end $content doesn't match =begin " 1167 . $curr_open->[-1][1]{'target'} 1168 . ". (Stack: " 1169 . $self->_dump_curr_open() . ')' 1170 ); 1171 DEBUG and print STDERR "Ignoring mistargetted =end $content at line $para->[1]{'start_line'}\n"; 1172 return 1; 1173 } 1174 1175 # Else it's okay to close... 1176 if(grep $_->[1]{'~ignore'}, @$curr_open) { 1177 DEBUG > 1 and print STDERR "Not firing any event for this =end $content because in an ignored region\n"; 1178 # And that may be because of this to-be-closed =for region, or some 1179 # other one, but it doesn't matter. 1180 } else { 1181 $curr_open->[-1][1]{'start_line'} = $para->[1]{'start_line'}; 1182 # what's that for? 1183 1184 $self->{'content_seen'} ||= 1; 1185 $self->_handle_element_end( my $scratch = 'for', $para->[1]); 1186 } 1187 DEBUG > 1 and print STDERR "Popping $curr_open->[-1][0] $curr_open->[-1][1]{'target'} because of =end $content\n"; 1188 pop @$curr_open; 1189 1190 return 1; 1191} 1192 1193sub _ponder_doc_end { 1194 my ($self,$para,$curr_open,$paras) = @_; 1195 if(@$curr_open) { # Deal with things left open 1196 DEBUG and print STDERR "Stack is nonempty at end-document: (", 1197 $self->_dump_curr_open(), ")\n"; 1198 1199 DEBUG > 9 and print STDERR "Stack: ", pretty($curr_open), "\n"; 1200 unshift @$paras, $self->_closers_for_all_curr_open; 1201 # Make sure there is exactly one ~end in the parastack, at the end: 1202 @$paras = grep $_->[0] ne '~end', @$paras; 1203 push @$paras, $para, $para; 1204 # We need two -- once for the next cycle where we 1205 # generate errata, and then another to be at the end 1206 # when that loop back around to process the errata. 1207 return 1; 1208 1209 } else { 1210 DEBUG and print STDERR "Okay, stack is empty now.\n"; 1211 } 1212 1213 # Try generating errata section, if applicable 1214 unless($self->{'~tried_gen_errata'}) { 1215 $self->{'~tried_gen_errata'} = 1; 1216 my @extras = $self->_gen_errata(); 1217 if(@extras) { 1218 unshift @$paras, @extras; 1219 DEBUG and print STDERR "Generated errata... relooping...\n"; 1220 return 1; # I.e., loop around again to process these fake-o paragraphs 1221 } 1222 } 1223 1224 splice @$paras; # Well, that's that for this paragraph buffer. 1225 DEBUG and print STDERR "Throwing end-document event.\n"; 1226 1227 $self->_handle_element_end( my $scratch = 'Document' ); 1228 return 1; # Hasta la byebye 1229} 1230 1231sub _ponder_pod { 1232 my ($self,$para,$curr_open,$paras) = @_; 1233 $self->whine( 1234 $para->[1]{'start_line'}, 1235 "=pod directives shouldn't be over one line long! Ignoring all " 1236 . (@$para - 2) . " lines of content" 1237 ) if @$para > 3; 1238 1239 # Content ignored unless 'pod_handler' is set 1240 if (my $pod_handler = $self->{'pod_handler'}) { 1241 my ($line_num, $line) = map $_, $para->[1]{'start_line'}, $para->[2]; 1242 $line = $line eq '' ? "=pod" : "=pod $line"; # imitate cut_handler output 1243 $pod_handler->($line, $line_num, $self); 1244 } 1245 1246 # The surrounding methods set content_seen, so let us remain consistent. 1247 # I do not know why it was not here before -- should it not be here? 1248 # $self->{'content_seen'} ||= 1; 1249 1250 return; 1251} 1252 1253sub _ponder_over { 1254 my ($self,$para,$curr_open,$paras) = @_; 1255 return 1 unless @$paras; 1256 my $list_type; 1257 1258 if($paras->[0][0] eq '=item') { # most common case 1259 $list_type = $self->_get_initial_item_type($paras->[0]); 1260 1261 } elsif($paras->[0][0] eq '=back') { 1262 # Ignore empty lists by default 1263 if ($self->{'parse_empty_lists'}) { 1264 $list_type = 'empty'; 1265 } else { 1266 shift @$paras; 1267 return 1; 1268 } 1269 } elsif($paras->[0][0] eq '~end') { 1270 $self->whine( 1271 $para->[1]{'start_line'}, 1272 "=over is the last thing in the document?!" 1273 ); 1274 return 1; # But feh, ignore it. 1275 } else { 1276 $list_type = 'block'; 1277 } 1278 $para->[1]{'~type'} = $list_type; 1279 push @$curr_open, $para; 1280 # yes, we reuse the paragraph as a stack item 1281 1282 my $content = join ' ', splice @$para, 2; 1283 my $overness; 1284 if($content =~ m/^\s*$/s) { 1285 $para->[1]{'indent'} = 4; 1286 } elsif($content =~ m/^\s*((?:\d*\.)?\d+)\s*$/s) { 1287 no integer; 1288 $para->[1]{'indent'} = $1; 1289 if($1 == 0) { 1290 $self->whine( 1291 $para->[1]{'start_line'}, 1292 "Can't have a 0 in =over $content" 1293 ); 1294 $para->[1]{'indent'} = 4; 1295 } 1296 } else { 1297 $self->whine( 1298 $para->[1]{'start_line'}, 1299 "=over should be: '=over' or '=over positive_number'" 1300 ); 1301 $para->[1]{'indent'} = 4; 1302 } 1303 DEBUG > 1 and print STDERR "=over found of type $list_type\n"; 1304 1305 $self->{'content_seen'} ||= 1; 1306 $self->_handle_element_start((my $scratch = 'over-' . $list_type), $para->[1]); 1307 1308 return; 1309} 1310 1311sub _ponder_back { 1312 my ($self,$para,$curr_open,$paras) = @_; 1313 # TODO: fire off </item-number> or </item-bullet> or </item-text> ?? 1314 1315 my $content = join ' ', splice @$para, 2; 1316 if($content =~ m/\S/) { 1317 $self->whine( 1318 $para->[1]{'start_line'}, 1319 "=back doesn't take any parameters, but you said =back $content" 1320 ); 1321 } 1322 1323 if(@$curr_open and $curr_open->[-1][0] eq '=over') { 1324 DEBUG > 1 and print STDERR "=back happily closes matching =over\n"; 1325 # Expected case: we're closing the most recently opened thing 1326 #my $over = pop @$curr_open; 1327 $self->{'content_seen'} ||= 1; 1328 $self->_handle_element_end( my $scratch = 1329 'over-' . ( (pop @$curr_open)->[1]{'~type'} ), $para->[1] 1330 ); 1331 } else { 1332 DEBUG > 1 and print STDERR "=back found without a matching =over. Stack: (", 1333 join(', ', map $_->[0], @$curr_open), ").\n"; 1334 $self->whine( 1335 $para->[1]{'start_line'}, 1336 '=back without =over' 1337 ); 1338 return 1; # and ignore it 1339 } 1340} 1341 1342sub _ponder_item { 1343 my ($self,$para,$curr_open,$paras) = @_; 1344 my $over; 1345 unless(@$curr_open and 1346 $over = (grep { $_->[0] eq '=over' } @$curr_open)[-1]) { 1347 $self->whine( 1348 $para->[1]{'start_line'}, 1349 "'=item' outside of any '=over'" 1350 ); 1351 unshift @$paras, 1352 ['=over', {'start_line' => $para->[1]{'start_line'}}, ''], 1353 $para 1354 ; 1355 return 1; 1356 } 1357 1358 1359 my $over_type = $over->[1]{'~type'}; 1360 1361 if(!$over_type) { 1362 # Shouldn't happen1 1363 die "Typeless over in stack, starting at line " 1364 . $over->[1]{'start_line'}; 1365 1366 } elsif($over_type eq 'block') { 1367 unless($curr_open->[-1][1]{'~bitched_about'}) { 1368 $curr_open->[-1][1]{'~bitched_about'} = 1; 1369 $self->whine( 1370 $curr_open->[-1][1]{'start_line'}, 1371 "You can't have =items (as at line " 1372 . $para->[1]{'start_line'} 1373 . ") unless the first thing after the =over is an =item" 1374 ); 1375 } 1376 # Just turn it into a paragraph and reconsider it 1377 $para->[0] = '~Para'; 1378 unshift @$paras, $para; 1379 return 1; 1380 1381 } elsif($over_type eq 'text') { 1382 my $item_type = $self->_get_item_type($para); 1383 # That kills the content of the item if it's a number or bullet. 1384 DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n"; 1385 1386 if($item_type eq 'text') { 1387 # Nothing special needs doing for 'text' 1388 } elsif($item_type eq 'number' or $item_type eq 'bullet') { 1389 $self->whine( 1390 $para->[1]{'start_line'}, 1391 "Expected text after =item, not a $item_type" 1392 ); 1393 # Undo our clobbering: 1394 push @$para, $para->[1]{'~orig_content'}; 1395 delete $para->[1]{'number'}; 1396 # Only a PROPER item-number element is allowed 1397 # to have a number attribute. 1398 } else { 1399 die "Unhandled item type $item_type"; # should never happen 1400 } 1401 1402 # =item-text thingies don't need any assimilation, it seems. 1403 1404 } elsif($over_type eq 'number') { 1405 my $item_type = $self->_get_item_type($para); 1406 # That kills the content of the item if it's a number or bullet. 1407 DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n"; 1408 1409 my $expected_value = ++ $curr_open->[-1][1]{'~counter'}; 1410 1411 if($item_type eq 'bullet') { 1412 # Hm, it's not numeric. Correct for this. 1413 $para->[1]{'number'} = $expected_value; 1414 $self->whine( 1415 $para->[1]{'start_line'}, 1416 "Expected '=item $expected_value'" 1417 ); 1418 push @$para, $para->[1]{'~orig_content'}; 1419 # restore the bullet, blocking the assimilation of next para 1420 1421 } elsif($item_type eq 'text') { 1422 # Hm, it's not numeric. Correct for this. 1423 $para->[1]{'number'} = $expected_value; 1424 $self->whine( 1425 $para->[1]{'start_line'}, 1426 "Expected '=item $expected_value'" 1427 ); 1428 # Text content will still be there and will block next ~Para 1429 1430 } elsif($item_type ne 'number') { 1431 die "Unknown item type $item_type"; # should never happen 1432 1433 } elsif($expected_value == $para->[1]{'number'}) { 1434 DEBUG > 1 and print STDERR " Numeric item has the expected value of $expected_value\n"; 1435 1436 } else { 1437 DEBUG > 1 and print STDERR " Numeric item has ", $para->[1]{'number'}, 1438 " instead of the expected value of $expected_value\n"; 1439 $self->whine( 1440 $para->[1]{'start_line'}, 1441 "You have '=item " . $para->[1]{'number'} . 1442 "' instead of the expected '=item $expected_value'" 1443 ); 1444 $para->[1]{'number'} = $expected_value; # correcting!! 1445 } 1446 1447 if(@$para == 2) { 1448 # For the cases where we /didn't/ push to @$para 1449 if($paras->[0][0] eq '~Para') { 1450 DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n"; 1451 push @$para, splice @{shift @$paras},2; 1452 } else { 1453 DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n"; 1454 push @$para, ''; # Just so it's not contentless 1455 } 1456 } 1457 1458 1459 } elsif($over_type eq 'bullet') { 1460 my $item_type = $self->_get_item_type($para); 1461 # That kills the content of the item if it's a number or bullet. 1462 DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n"; 1463 1464 if($item_type eq 'bullet') { 1465 # as expected! 1466 1467 if( $para->[1]{'~_freaky_para_hack'} ) { 1468 DEBUG and print STDERR "Accomodating '=item * Foo' tolerance hack.\n"; 1469 push @$para, delete $para->[1]{'~_freaky_para_hack'}; 1470 } 1471 1472 } elsif($item_type eq 'number') { 1473 $self->whine( 1474 $para->[1]{'start_line'}, 1475 "Expected '=item *'" 1476 ); 1477 push @$para, $para->[1]{'~orig_content'}; 1478 # and block assimilation of the next paragraph 1479 delete $para->[1]{'number'}; 1480 # Only a PROPER item-number element is allowed 1481 # to have a number attribute. 1482 } elsif($item_type eq 'text') { 1483 $self->whine( 1484 $para->[1]{'start_line'}, 1485 "Expected '=item *'" 1486 ); 1487 # But doesn't need processing. But it'll block assimilation 1488 # of the next para. 1489 } else { 1490 die "Unhandled item type $item_type"; # should never happen 1491 } 1492 1493 if(@$para == 2) { 1494 # For the cases where we /didn't/ push to @$para 1495 if($paras->[0][0] eq '~Para') { 1496 DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n"; 1497 push @$para, splice @{shift @$paras},2; 1498 } else { 1499 DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n"; 1500 push @$para, ''; # Just so it's not contentless 1501 } 1502 } 1503 1504 } else { 1505 die "Unhandled =over type \"$over_type\"?"; 1506 # Shouldn't happen! 1507 } 1508 $para->[0] .= '-' . $over_type; 1509 1510 return; 1511} 1512 1513sub _ponder_Plain { 1514 my ($self,$para) = @_; 1515 DEBUG and print STDERR " giving plain treatment...\n"; 1516 unless( @$para == 2 or ( @$para == 3 and $para->[2] eq '' ) 1517 or $para->[1]{'~cooked'} 1518 ) { 1519 push @$para, 1520 @{$self->_make_treelet( 1521 join("\n", splice(@$para, 2)), 1522 $para->[1]{'start_line'} 1523 )}; 1524 } 1525 # Empty paragraphs don't need a treelet for any reason I can see. 1526 # And precooked paragraphs already have a treelet. 1527 return; 1528} 1529 1530sub _ponder_Verbatim { 1531 my ($self,$para) = @_; 1532 DEBUG and print STDERR " giving verbatim treatment...\n"; 1533 1534 $para->[1]{'xml:space'} = 'preserve'; 1535 1536 my $indent = $self->strip_verbatim_indent; 1537 if ($indent && ref $indent eq 'CODE') { 1538 my @shifted = (shift @{$para}, shift @{$para}); 1539 $indent = $indent->($para); 1540 unshift @{$para}, @shifted; 1541 } 1542 1543 for(my $i = 2; $i < @$para; $i++) { 1544 foreach my $line ($para->[$i]) { # just for aliasing 1545 # Strip indentation. 1546 $line =~ s/^\Q$indent// if $indent 1547 && !($self->{accept_codes} && $self->{accept_codes}{VerbatimFormatted}); 1548 while( $line =~ 1549 # Sort of adapted from Text::Tabs -- yes, it's hardwired in that 1550 # tabs are at every EIGHTH column. For portability, it has to be 1551 # one setting everywhere, and 8th wins. 1552 s/^([^\t]*)(\t+)/$1.(" " x ((length($2)<<3)-(length($1)&7)))/e 1553 ) {} 1554 1555 # TODO: whinge about (or otherwise treat) unindented or overlong lines 1556 1557 } 1558 } 1559 1560 # Now the VerbatimFormatted hoodoo... 1561 if( $self->{'accept_codes'} and 1562 $self->{'accept_codes'}{'VerbatimFormatted'} 1563 ) { 1564 while(@$para > 3 and $para->[-1] !~ m/\S/) { pop @$para } 1565 # Kill any number of terminal newlines 1566 $self->_verbatim_format($para); 1567 } elsif ($self->{'codes_in_verbatim'}) { 1568 push @$para, 1569 @{$self->_make_treelet( 1570 join("\n", splice(@$para, 2)), 1571 $para->[1]{'start_line'}, $para->[1]{'xml:space'} 1572 )}; 1573 $para->[-1] =~ s/\n+$//s; # Kill any number of terminal newlines 1574 } else { 1575 push @$para, join "\n", splice(@$para, 2) if @$para > 3; 1576 $para->[-1] =~ s/\n+$//s; # Kill any number of terminal newlines 1577 } 1578 return; 1579} 1580 1581sub _ponder_Data { 1582 my ($self,$para) = @_; 1583 DEBUG and print STDERR " giving data treatment...\n"; 1584 $para->[1]{'xml:space'} = 'preserve'; 1585 push @$para, join "\n", splice(@$para, 2) if @$para > 3; 1586 return; 1587} 1588 1589 1590 1591 1592########################################################################### 1593 1594sub _traverse_treelet_bit { # for use only by the routine above 1595 my($self, $name) = splice @_,0,2; 1596 1597 my $scratch; 1598 $self->_handle_element_start(($scratch=$name), shift @_); 1599 1600 while (@_) { 1601 my $x = shift; 1602 if (ref($x)) { 1603 &_traverse_treelet_bit($self, @$x); 1604 } else { 1605 $x .= shift while @_ && !ref($_[0]); 1606 $self->_handle_text($x); 1607 } 1608 } 1609 1610 $self->_handle_element_end($scratch=$name); 1611 return; 1612} 1613 1614#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 1615 1616sub _closers_for_all_curr_open { 1617 my $self = $_[0]; 1618 my @closers; 1619 foreach my $still_open (@{ $self->{'curr_open'} || return }) { 1620 my @copy = @$still_open; 1621 $copy[1] = {%{ $copy[1] }}; 1622 #$copy[1]{'start_line'} = -1; 1623 if($copy[0] eq '=for') { 1624 $copy[0] = '=end'; 1625 } elsif($copy[0] eq '=over') { 1626 $self->whine( 1627 $still_open->[1]{start_line} , 1628 "=over without closing =back" 1629 ); 1630 1631 $copy[0] = '=back'; 1632 } else { 1633 die "I don't know how to auto-close an open $copy[0] region"; 1634 } 1635 1636 unless( @copy > 2 ) { 1637 push @copy, $copy[1]{'target'}; 1638 $copy[-1] = '' unless defined $copy[-1]; 1639 # since =over's don't have targets 1640 } 1641 1642 $copy[1]{'fake-closer'} = 1; 1643 1644 DEBUG and print STDERR "Queuing up fake-o event: ", pretty(\@copy), "\n"; 1645 unshift @closers, \@copy; 1646 } 1647 return @closers; 1648} 1649 1650#-------------------------------------------------------------------------- 1651 1652sub _verbatim_format { 1653 my($it, $p) = @_; 1654 1655 my $formatting; 1656 1657 for(my $i = 2; $i < @$p; $i++) { # work backwards over the lines 1658 DEBUG and print STDERR "_verbatim_format appends a newline to $i: $p->[$i]\n"; 1659 $p->[$i] .= "\n"; 1660 # Unlike with simple Verbatim blocks, we don't end up just doing 1661 # a join("\n", ...) on the contents, so we have to append a 1662 # newline to ever line, and then nix the last one later. 1663 } 1664 1665 if( DEBUG > 4 ) { 1666 print STDERR "<<\n"; 1667 for(my $i = $#$p; $i >= 2; $i--) { # work backwards over the lines 1668 print STDERR "_verbatim_format $i: $p->[$i]"; 1669 } 1670 print STDERR ">>\n"; 1671 } 1672 1673 for(my $i = $#$p; $i > 2; $i--) { 1674 # work backwards over the lines, except the first (#2) 1675 1676 #next unless $p->[$i] =~ m{^#:([ \^\/\%]*)\n?$}s 1677 # and $p->[$i-1] !~ m{^#:[ \^\/\%]*\n?$}s; 1678 # look at a formatty line preceding a nonformatty one 1679 DEBUG > 5 and print STDERR "Scrutinizing line $i: $$p[$i]\n"; 1680 if($p->[$i] =~ m{^#:([ \^\/\%]*)\n?$}s) { 1681 DEBUG > 5 and print STDERR " It's a formatty line. ", 1682 "Peeking at previous line ", $i-1, ": $$p[$i-1]: \n"; 1683 1684 if( $p->[$i-1] =~ m{^#:[ \^\/\%]*\n?$}s ) { 1685 DEBUG > 5 and print STDERR " Previous line is formatty! Skipping this one.\n"; 1686 next; 1687 } else { 1688 DEBUG > 5 and print STDERR " Previous line is non-formatty! Yay!\n"; 1689 } 1690 } else { 1691 DEBUG > 5 and print STDERR " It's not a formatty line. Ignoring\n"; 1692 next; 1693 } 1694 1695 # A formatty line has to have #: in the first two columns, and uses 1696 # "^" to mean bold, "/" to mean underline, and "%" to mean bold italic. 1697 # Example: 1698 # What do you want? i like pie. [or whatever] 1699 # #:^^^^^^^^^^^^^^^^^ ///////////// 1700 1701 1702 DEBUG > 4 and print STDERR "_verbatim_format considers:\n<$p->[$i-1]>\n<$p->[$i]>\n"; 1703 1704 $formatting = ' ' . $1; 1705 $formatting =~ s/\s+$//s; # nix trailing whitespace 1706 unless(length $formatting and $p->[$i-1] =~ m/\S/) { # no-op 1707 splice @$p,$i,1; # remove this line 1708 $i--; # don't consider next line 1709 next; 1710 } 1711 1712 if( length($formatting) >= length($p->[$i-1]) ) { 1713 $formatting = substr($formatting, 0, length($p->[$i-1]) - 1) . ' '; 1714 } else { 1715 $formatting .= ' ' x (length($p->[$i-1]) - length($formatting)); 1716 } 1717 # Make $formatting and the previous line be exactly the same length, 1718 # with $formatting having a " " as the last character. 1719 1720 DEBUG > 4 and print STDERR "Formatting <$formatting> on <", $p->[$i-1], ">\n"; 1721 1722 1723 my @new_line; 1724 while( $formatting =~ m{\G(( +)|(\^+)|(\/+)|(\%+))}g ) { 1725 #print STDERR "Format matches $1\n"; 1726 1727 if($2) { 1728 #print STDERR "SKIPPING <$2>\n"; 1729 push @new_line, 1730 substr($p->[$i-1], pos($formatting)-length($1), length($1)); 1731 } else { 1732 #print STDERR "SNARING $+\n"; 1733 push @new_line, [ 1734 ( 1735 $3 ? 'VerbatimB' : 1736 $4 ? 'VerbatimI' : 1737 $5 ? 'VerbatimBI' : die("Should never get called") 1738 ), {}, 1739 substr($p->[$i-1], pos($formatting)-length($1), length($1)) 1740 ]; 1741 #print STDERR "Formatting <$new_line[-1][-1]> as $new_line[-1][0]\n"; 1742 } 1743 } 1744 my @nixed = 1745 splice @$p, $i-1, 2, @new_line; # replace myself and the next line 1746 DEBUG > 10 and print STDERR "Nixed count: ", scalar(@nixed), "\n"; 1747 1748 DEBUG > 6 and print STDERR "New version of the above line is these tokens (", 1749 scalar(@new_line), "):", 1750 map( ref($_)?"<@$_> ":"<$_>", @new_line ), "\n"; 1751 $i--; # So the next line we scrutinize is the line before the one 1752 # that we just went and formatted 1753 } 1754 1755 $p->[0] = 'VerbatimFormatted'; 1756 1757 # Collapse adjacent text nodes, just for kicks. 1758 for( my $i = 2; $i > $#$p; $i++ ) { # work forwards over the tokens except for the last 1759 if( !ref($p->[$i]) and !ref($p->[$i + 1]) ) { 1760 DEBUG > 5 and print STDERR "_verbatim_format merges {$p->[$i]} and {$p->[$i+1]}\n"; 1761 $p->[$i] .= splice @$p, $i+1, 1; # merge 1762 --$i; # and back up 1763 } 1764 } 1765 1766 # Now look for the last text token, and remove the terminal newline 1767 for( my $i = $#$p; $i >= 2; $i-- ) { 1768 # work backwards over the tokens, even the first 1769 if( !ref($p->[$i]) ) { 1770 if($p->[$i] =~ s/\n$//s) { 1771 DEBUG > 5 and print STDERR "_verbatim_format killed the terminal newline on #$i: {$p->[$i]}, after {$p->[$i-1]}\n"; 1772 } else { 1773 DEBUG > 5 and print STDERR 1774 "No terminal newline on #$i: {$p->[$i]}, after {$p->[$i-1]} !?\n"; 1775 } 1776 last; # we only want the next one 1777 } 1778 } 1779 1780 return; 1781} 1782 1783 1784#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 1785 1786 1787sub _treelet_from_formatting_codes { 1788 # Given a paragraph, returns a treelet. Full of scary tokenizing code. 1789 # Like [ '~Top', {'start_line' => $start_line}, 1790 # "I like ", 1791 # [ 'B', {}, "pie" ], 1792 # "!" 1793 # ] 1794 1795 my($self, $para, $start_line, $preserve_space) = @_; 1796 1797 my $treelet = ['~Top', {'start_line' => $start_line},]; 1798 1799 unless ($preserve_space || $self->{'preserve_whitespace'}) { 1800 $para =~ s/\s+/ /g; # collapse and trim all whitespace first. 1801 $para =~ s/ $//; 1802 $para =~ s/^ //; 1803 } 1804 1805 # Only apparent problem the above code is that N<< >> turns into 1806 # N<< >>. But then, word wrapping does that too! So don't do that! 1807 1808 my @stack; 1809 my @lineage = ($treelet); 1810 my $raw = ''; # raw content of L<> fcode before splitting/processing 1811 # XXX 'raw' is not 100% accurate: all surrounding whitespace is condensed 1812 # into just 1 ' '. Is this the regex's doing or 'raw's? 1813 my $inL = 0; 1814 1815 DEBUG > 4 and print STDERR "Paragraph:\n$para\n\n"; 1816 1817 # Here begins our frightening tokenizer RE. The following regex matches 1818 # text in four main parts: 1819 # 1820 # * Start-codes. The first alternative matches C< or C<<, the latter 1821 # followed by some whitespace. $1 will hold the entire start code 1822 # (including any space following a multiple-angle-bracket delimiter), 1823 # and $2 will hold only the additional brackets past the first in a 1824 # multiple-bracket delimiter. length($2) + 1 will be the number of 1825 # closing brackets we have to find. 1826 # 1827 # * Closing brackets. Match some amount of whitespace followed by 1828 # multiple close brackets. The logic to see if this closes anything 1829 # is down below. Note that in order to parse C<< >> correctly, we 1830 # have to use look-behind (?<=\s\s), since the match of the starting 1831 # code will have consumed the whitespace. 1832 # 1833 # * A single closing bracket, to close a simple code like C<>. 1834 # 1835 # * Something that isn't a start or end code. We have to be careful 1836 # about accepting whitespace, since perlpodspec says that any whitespace 1837 # before a multiple-bracket closing delimiter should be ignored. 1838 # 1839 while($para =~ 1840 m/\G 1841 (?: 1842 # Match starting codes, including the whitespace following a 1843 # multiple-delimiter start code. $1 gets the whole start code and 1844 # $2 gets all but one of the <s in the multiple-bracket case. 1845 ([A-Z]<(?:(<+)\s+)?) 1846 | 1847 # Match multiple-bracket end codes. $3 gets the whitespace that 1848 # should be discarded before an end bracket but kept in other cases 1849 # and $4 gets the end brackets themselves. 1850 (\s+|(?<=\s\s))(>{2,}) 1851 | 1852 (\s?>) # $5: simple end-codes 1853 | 1854 ( # $6: stuff containing no start-codes or end-codes 1855 (?: 1856 [^A-Z\s>] 1857 | 1858 (?: 1859 [A-Z](?!<) 1860 ) 1861 | 1862 # whitespace is ok, but we don't want to eat the whitespace before 1863 # a multiple-bracket end code. 1864 # NOTE: we may still have problems with e.g. S<< >> 1865 (?: 1866 \s(?!\s*>{2,}) 1867 ) 1868 )+ 1869 ) 1870 ) 1871 /xgo 1872 ) { 1873 DEBUG > 4 and print STDERR "\nParagraphic tokenstack = (@stack)\n"; 1874 if(defined $1) { 1875 if(defined $2) { 1876 DEBUG > 3 and print STDERR "Found complex start-text code \"$1\"\n"; 1877 push @stack, length($2) + 1; 1878 # length of the necessary complex end-code string 1879 } else { 1880 DEBUG > 3 and print STDERR "Found simple start-text code \"$1\"\n"; 1881 push @stack, 0; # signal that we're looking for simple 1882 } 1883 push @lineage, [ substr($1,0,1), {}, ]; # new node object 1884 push @{ $lineage[-2] }, $lineage[-1]; 1885 if ('L' eq substr($1,0,1)) { 1886 $raw = $inL ? $raw.$1 : ''; # reset raw content accumulator 1887 $inL = 1; 1888 } else { 1889 $raw .= $1 if $inL; 1890 } 1891 1892 } elsif(defined $4) { 1893 DEBUG > 3 and print STDERR "Found apparent complex end-text code \"$3$4\"\n"; 1894 # This is where it gets messy... 1895 if(! @stack) { 1896 # We saw " >>>>" but needed nothing. This is ALL just stuff then. 1897 DEBUG > 4 and print STDERR " But it's really just stuff.\n"; 1898 push @{ $lineage[-1] }, $3, $4; 1899 next; 1900 } elsif(!$stack[-1]) { 1901 # We saw " >>>>" but needed only ">". Back pos up. 1902 DEBUG > 4 and print STDERR " And that's more than we needed to close simple.\n"; 1903 push @{ $lineage[-1] }, $3; # That was a for-real space, too. 1904 pos($para) = pos($para) - length($4) + 1; 1905 } elsif($stack[-1] == length($4)) { 1906 # We found " >>>>", and it was exactly what we needed. Commonest case. 1907 DEBUG > 4 and print STDERR " And that's exactly what we needed to close complex.\n"; 1908 } elsif($stack[-1] < length($4)) { 1909 # We saw " >>>>" but needed only " >>". Back pos up. 1910 DEBUG > 4 and print STDERR " And that's more than we needed to close complex.\n"; 1911 pos($para) = pos($para) - length($4) + $stack[-1]; 1912 } else { 1913 # We saw " >>>>" but needed " >>>>>>". So this is all just stuff! 1914 DEBUG > 4 and print STDERR " But it's really just stuff, because we needed more.\n"; 1915 push @{ $lineage[-1] }, $3, $4; 1916 next; 1917 } 1918 #print STDERR "\nHOOBOY ", scalar(@{$lineage[-1]}), "!!!\n"; 1919 1920 push @{ $lineage[-1] }, '' if 2 == @{ $lineage[-1] }; 1921 # Keep the element from being childless 1922 1923 pop @stack; 1924 pop @lineage; 1925 1926 unless (@stack) { # not in an L if there are no open fcodes 1927 $inL = 0; 1928 if (ref $lineage[-1][-1] && $lineage[-1][-1][0] eq 'L') { 1929 $lineage[-1][-1][1]{'raw'} = $raw 1930 } 1931 } 1932 $raw .= $3.$4 if $inL; 1933 1934 } elsif(defined $5) { 1935 DEBUG > 3 and print STDERR "Found apparent simple end-text code \"$5\"\n"; 1936 1937 if(@stack and ! $stack[-1]) { 1938 # We're indeed expecting a simple end-code 1939 DEBUG > 4 and print STDERR " It's indeed an end-code.\n"; 1940 1941 if(length($5) == 2) { # There was a space there: " >" 1942 push @{ $lineage[-1] }, ' '; 1943 } elsif( 2 == @{ $lineage[-1] } ) { # Closing a childless element 1944 push @{ $lineage[-1] }, ''; # keep it from being really childless 1945 } 1946 1947 pop @stack; 1948 pop @lineage; 1949 } else { 1950 DEBUG > 4 and print STDERR " It's just stuff.\n"; 1951 push @{ $lineage[-1] }, $5; 1952 } 1953 1954 unless (@stack) { # not in an L if there are no open fcodes 1955 $inL = 0; 1956 if (ref $lineage[-1][-1] && $lineage[-1][-1][0] eq 'L') { 1957 $lineage[-1][-1][1]{'raw'} = $raw 1958 } 1959 } 1960 $raw .= $5 if $inL; 1961 1962 } elsif(defined $6) { 1963 DEBUG > 3 and print STDERR "Found stuff \"$6\"\n"; 1964 push @{ $lineage[-1] }, $6; 1965 $raw .= $6 if $inL; 1966 # XXX does not capture multiplace whitespaces -- 'raw' ends up with 1967 # at most 1 leading/trailing whitespace, why not all of it? 1968 1969 } else { 1970 # should never ever ever ever happen 1971 DEBUG and print STDERR "AYYAYAAAAA at line ", __LINE__, "\n"; 1972 die "SPORK 512512!"; 1973 } 1974 } 1975 1976 if(@stack) { # Uhoh, some sequences weren't closed. 1977 my $x= "..."; 1978 while(@stack) { 1979 push @{ $lineage[-1] }, '' if 2 == @{ $lineage[-1] }; 1980 # Hmmmmm! 1981 1982 my $code = (pop @lineage)->[0]; 1983 my $ender_length = pop @stack; 1984 if($ender_length) { 1985 --$ender_length; 1986 $x = $code . ("<" x $ender_length) . " $x " . (">" x $ender_length); 1987 } else { 1988 $x = $code . "<$x>"; 1989 } 1990 } 1991 DEBUG > 1 and print STDERR "Unterminated $x sequence\n"; 1992 $self->whine($start_line, 1993 "Unterminated $x sequence", 1994 ); 1995 } 1996 1997 return $treelet; 1998} 1999 2000#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 2001 2002sub text_content_of_treelet { # method: $parser->text_content_of_treelet($lol) 2003 return stringify_lol($_[1]); 2004} 2005 2006sub stringify_lol { # function: stringify_lol($lol) 2007 my $string_form = ''; 2008 _stringify_lol( $_[0] => \$string_form ); 2009 return $string_form; 2010} 2011 2012sub _stringify_lol { # the real recursor 2013 my($lol, $to) = @_; 2014 for(my $i = 2; $i < @$lol; ++$i) { 2015 if( ref($lol->[$i] || '') and UNIVERSAL::isa($lol->[$i], 'ARRAY') ) { 2016 _stringify_lol( $lol->[$i], $to); # recurse! 2017 } else { 2018 $$to .= $lol->[$i]; 2019 } 2020 } 2021 return; 2022} 2023 2024#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 2025 2026sub _dump_curr_open { # return a string representation of the stack 2027 my $curr_open = $_[0]{'curr_open'}; 2028 2029 return '[empty]' unless @$curr_open; 2030 return join '; ', 2031 map {; 2032 ($_->[0] eq '=for') 2033 ? ( ($_->[1]{'~really'} || '=over') 2034 . ' ' . $_->[1]{'target'}) 2035 : $_->[0] 2036 } 2037 @$curr_open 2038 ; 2039} 2040 2041########################################################################### 2042my %pretty_form = ( 2043 "\a" => '\a', # ding! 2044 "\b" => '\b', # BS 2045 "\e" => '\e', # ESC 2046 "\f" => '\f', # FF 2047 "\t" => '\t', # tab 2048 "\cm" => '\cm', 2049 "\cj" => '\cj', 2050 "\n" => '\n', # probably overrides one of either \cm or \cj 2051 '"' => '\"', 2052 '\\' => '\\\\', 2053 '$' => '\\$', 2054 '@' => '\\@', 2055 '%' => '\\%', 2056 '#' => '\\#', 2057); 2058 2059sub pretty { # adopted from Class::Classless 2060 # Not the most brilliant routine, but passable. 2061 # Don't give it a cyclic data structure! 2062 my @stuff = @_; # copy 2063 my $x; 2064 my $out = 2065 # join ",\n" . 2066 join ", ", 2067 map {; 2068 if(!defined($_)) { 2069 "undef"; 2070 } elsif(ref($_) eq 'ARRAY' or ref($_) eq 'Pod::Simple::LinkSection') { 2071 $x = "[ " . pretty(@$_) . " ]" ; 2072 $x; 2073 } elsif(ref($_) eq 'SCALAR') { 2074 $x = "\\" . pretty($$_) ; 2075 $x; 2076 } elsif(ref($_) eq 'HASH') { 2077 my $hr = $_; 2078 $x = "{" . join(", ", 2079 map(pretty($_) . '=>' . pretty($hr->{$_}), 2080 sort keys %$hr ) ) . "}" ; 2081 $x; 2082 } elsif(!length($_)) { q{''} # empty string 2083 } elsif( 2084 $_ eq '0' # very common case 2085 or( 2086 m/^-?(?:[123456789]\d*|0)(?:\.\d+)?$/s 2087 and $_ ne '-0' # the strange case that RE lets thru 2088 ) 2089 ) { $_; 2090 } else { 2091 # Yes, explicitly name every character desired. There are shorcuts one 2092 # could make, but I (Karl Williamson) was afraid that some Perl 2093 # releases would have bugs in some of them. For example [A-Z] works 2094 # even on EBCDIC platforms to match exactly the 26 uppercase English 2095 # letters, but I don't know if it has always worked without bugs. It 2096 # seemed safest just to list the characters. 2097 # s<([^\x20\x21\x23\x27-\x3F\x41-\x5B\x5D-\x7E])> 2098 s<([^ !#'()*+,\-./0123456789:;\<=\>?ABCDEFGHIJKLMNOPQRSTUVWXYZ\[\]^_`abcdefghijklmnopqrstuvwxyz{|}~])> 2099 <$pretty_form{$1} || '\\x{'.sprintf("%x", ord($1)).'}'>eg; 2100 #<$pretty_form{$1} || '\\x'.(unpack("H2",$1))>eg; 2101 qq{"$_"}; 2102 } 2103 } @stuff; 2104 # $out =~ s/\n */ /g if length($out) < 75; 2105 return $out; 2106} 2107 2108#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 2109 2110# A rather unsubtle method of blowing away all the state information 2111# from a parser object so it can be reused. Provided as a utility for 2112# backward compatibility in Pod::Man, etc. but not recommended for 2113# general use. 2114 2115sub reinit { 2116 my $self = shift; 2117 foreach (qw(source_dead source_filename doc_has_started 2118start_of_pod_block content_seen last_was_blank paras curr_open 2119line_count pod_para_count in_pod ~tried_gen_errata all_errata errata errors_seen 2120Title)) { 2121 2122 delete $self->{$_}; 2123 } 2124} 2125 2126#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 21271; 2128 2129