1package Pod::Simple::BlackBox;
2#
3# "What's in the box?"  "Pain."
4#
5###########################################################################
6#
7# This is where all the scary things happen: parsing lines into
8#  paragraphs; and then into directives, verbatims, and then also
9#  turning formatting sequences into treelets.
10#
11# Are you really sure you want to read this code?
12#
13#-----------------------------------------------------------------------------
14#
# The basic job of this module, Pod::Simple::BlackBox, is the dirty work of
# parsing Pod into treelets (generally one per non-verbatim paragraph) and
# calling the proper callbacks on those treelets.
18#
19# Every node in a treelet is a ['name', {attrhash}, ...children...]
20
21use integer; # vroom!
22use strict;
23use Carp ();
24use vars qw($VERSION );
25$VERSION = '3.35';
26#use constant DEBUG => 7;
BEGIN {
  # Borrow Pod::Simple's DEBUG sub at compile time, unless a DEBUG sub
  # has already been defined in this package (e.g. via "use constant").
  require Pod::Simple;
  unless( defined &DEBUG ) {
    *DEBUG = \&Pod::Simple::DEBUG;
  }
}
31
# $non_ascii_re matches a character iff that character would mean something
# different under CP1252 than under UTF-8 when there is no =encoding line.
# The POSIX class is compiled through a string eval so that a Perl which
# cannot compile it yields undef instead of dying; in that case we fall back
# to a plain byte range.  This is broken for early Perls on non-ASCII
# platforms.
my $non_ascii_re = eval "qr/[[:^ascii:]]/";
unless( defined $non_ascii_re ) {
  $non_ascii_re = qr/[\x80-\xFF]/;
}

# $utf8_bom holds the byte sequence of a UTF-8 encoded byte order mark.
# Start from the hard-coded ASCII-platform bytes (no EBCDIC BOM detection
# for early Perls), and, where utf8::encode is available, derive the bytes
# from U+FEFF instead.
my $utf8_bom = "\xEF\xBB\xBF";
if( $] ge 5.007_003 ) {
  $utf8_bom = "\x{FEFF}";
  utf8::encode($utf8_bom);
}
45
46#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
47
# Singular-named alias: hands every argument straight on to parse_lines
# and returns whatever parse_lines returns.
sub parse_line {
  my $self = shift;
  return $self->parse_lines(@_);
}
49
50# - - -  Turn back now!  Run away!  - - -
51
# Guess which encoding a line that contains non-ASCII bytes is written in.
# Called by parse_lines when no =encoding has been seen yet.
#
#   $line   - the line to examine (not modified; we work on a copy)
#   returns - the string 'CP1252' or 'UTF-8'
#
sub _guess_line_encoding {
  my($self, $line) = @_;

  # No =encoding line, and we are at the first line in the input that
  # contains a non-ascii byte, that is one whose meaning varies depending
  # on whether the file is encoded in UTF-8 or CP1252, which are the two
  # possibilities permitted by the pod spec.  (ASCII is assumed if the
  # file only contains ASCII bytes.)  In order to process this line, we
  # need to figure out what encoding we will use for the file.
  #
  # Strictly speaking ISO 8859-1 (Latin 1) refers to the code points
  # 160-255, but it is used here, as it often colloquially is, to refer to
  # the complete set of code points 0-255, including ASCII (0-127), the C1
  # controls (128-159), and strict Latin 1 (160-255).
  #
  # CP1252 is effectively a superset of Latin 1, because it differs only
  # from colloquial 8859-1 in the C1 controls, which are very unlikely to
  # actually be present in 8859-1 files, so can be used for other purposes
  # without conflict.  CP 1252 uses most of them for graphic characters.
  #
  # Note that all ASCII-range bytes represent their corresponding code
  # points in CP1252 and UTF-8.  In ASCII platform UTF-8 all other code
  # points require multiple (non-ASCII) bytes to represent.  (A separate
  # paragraph for EBCDIC is below.)  The multi-byte representation is
  # quite structured.  If we find an isolated byte that requires multiple
  # bytes to represent in UTF-8, we know that the encoding is not UTF-8.
  # If we find a sequence of bytes that violates the UTF-8 structure, we
  # also can presume the encoding isn't UTF-8, and hence must be 1252.
  #
  # But there are ambiguous cases where we could guess wrong.  If so, the
  # user will end up having to supply an =encoding line.  We use all
  # readily available information to improve our chances of guessing
  # right.  The odds of something not being UTF-8, but still passing a
  # UTF-8 validity test go down very rapidly with increasing length of the
  # sequence.  Therefore we look at all the maximal length non-ascii
  # sequences on the line.  If any of the sequences can't be UTF-8, we
  # quit there and choose CP1252.  If all could be UTF-8, we guess UTF-8.
  #
  # On EBCDIC platforms, the situation is somewhat different.  In
  # UTF-EBCDIC, not only do ASCII-range bytes represent their code points,
  # but so do the bytes that are for the C1 controls.  Recall that these
  # correspond to the unused portion of 8859-1 that 1252 mostly takes
  # over.  That means that there are fewer code points that are
  # represented by multi-bytes.  But, note that these controls are
  # very unlikely to be in pod text.  So if we encounter one of them, it
  # means that it is quite likely CP1252 and not UTF-8.  The net result is
  # the same code below is used for both platforms.
  while ($line =~ m/($non_ascii_re+)/g) {
    my $non_ascii_seq = $1;

    # An isolated non-ASCII byte cannot be a whole UTF-8 character.
    return 'CP1252' if length($non_ascii_seq) == 1;

    if ($] ge 5.007_003) {

      # On Perls that have this function, we can see if the sequence is
      # valid UTF-8 or not.
      my $is_utf8;
      {
        no warnings 'utf8';
        $is_utf8 = utf8::decode($non_ascii_seq);
      }
      return 'CP1252' if ! $is_utf8;

    } elsif (ord("A") == 65) {  # An early Perl, ASCII platform

      # Without utf8::decode, it's a lot harder to do a rigorous check
      # (though some early releases had a different function that
      # accomplished the same thing).  Since these are ancient Perls, not
      # likely to be in use today, we take the easy way out, and look at
      # just the first two bytes of the sequence to see if they are the
      # start of a UTF-8 character.  In ASCII UTF-8, continuation bytes
      # must be between 0x80 and 0xBF.  Start bytes can range from 0xC2
      # through 0xFF, but anything above 0xF4 is not Unicode, and hence
      # extremely unlikely to be in a pod.
      return 'CP1252' if $non_ascii_seq !~ /^[\xC2-\xF4][\x80-\xBF]/;

      # We don't bother doing anything special for EBCDIC on early Perls.
      # If there is a solitary variant, CP1252 will be chosen; otherwise
      # UTF-8.
    }
  } # End of loop through all variant sequences on the line

  # All sequences in the line could be UTF-8.  Guess that.
  return 'UTF-8';
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# Feed source lines to the parser.
#
# Usage: $parser->parse_lines(@lines)
#
# Each defined element of @lines is one line of source; an undef element
# means end-of-stream.  Lines outside pod blocks go to the 'code_handler'
# callback (if set); lines inside pod blocks are grouped into paragraphs
# in the $self->{'paras'} buffer, which _ponder_paragraph_buffer is called
# to process whenever a paragraph is known to be complete.  The 'cut_handler'
# and 'whiteline_handler' callbacks (if set) are called for =cut lines and
# for blank lines that contain whitespace, respectively.
#
# State carried across calls lives in $self: 'line_count', 'in_pod',
# 'start_of_pod_block', 'last_was_blank', 'pod_para_count', 'source_dead',
# 'encoding' and '_transcoder'.
#
# Returns $self.
#
sub parse_lines {
  my $self = shift;

  my $code_handler = $self->{'code_handler'};
  my $cut_handler  = $self->{'cut_handler'};
  my $wl_handler   = $self->{'whiteline_handler'};
  $self->{'line_count'} ||= 0;

  DEBUG > 4 and
   print STDERR "# Parsing starting at line ", $self->{'line_count'}, ".\n";

  DEBUG > 5 and
   print STDERR "#  About to parse lines: ",
     join(' ', map defined($_) ? "[$_]" : "EOF", @_), "\n";

  my $paras = ($self->{'paras'} ||= []);
   # paragraph buffer.  Because we need to defer processing of =over
   # directives and verbatim paragraphs.  We call _ponder_paragraph_buffer
   # to process this.

  $self->{'pod_para_count'} ||= 0;

  my $line;
  foreach my $source_line (@_) {
    if( $self->{'source_dead'} ) {
      DEBUG > 4 and print STDERR "# Source is dead.\n";
      last;
    }

    unless( defined $source_line ) {
      DEBUG > 4 and print STDERR "# Undef-line seen.\n";

      push @$paras, ['~end', {'start_line' => $self->{'line_count'}}];
      push @$paras, $paras->[-1], $paras->[-1];
       # So that it definitely fills the buffer.
      $self->{'source_dead'} = 1;
      $self->_ponder_paragraph_buffer;
      next;
    }


    if( $self->{'line_count'}++ ) {
      # Not the first line: just strip the line ending.
      ($line = $source_line) =~ tr/\n\r//d;
       # If we don't have two vars, we'll end up with that there
       # tr/// modding the (potentially read-only) original source line!

    } else {
      # The very first line: look for a byte order mark.
      DEBUG > 2 and print STDERR "First line: [$source_line]\n";

      if( ($line = $source_line) =~ s/^$utf8_bom//s ) {
        DEBUG and print STDERR "UTF-8 BOM seen.  Faking a '=encoding utf8'.\n";
        $self->_handle_encoding_line( "=encoding utf8" );
        # The faked directive must not count as a real =encoding line.
        delete $self->{'_processed_encoding'};
        $line =~ tr/\n\r//d;

      } elsif( $line =~ s/^\xFE\xFF//s ) {
        DEBUG and print STDERR "Big-endian UTF-16 BOM seen.  Aborting parsing.\n";
        $self->scream(
          $self->{'line_count'},
          "UTF16-BE Byte Encoding Mark found; but Pod::Simple v$Pod::Simple::VERSION doesn't implement UTF16 yet."
        );
        # NOTE(review): @_ is modified here while foreach is iterating
        #  over it, which perlsyn warns against; confirm that the undef
        #  pushed below is actually visited by the loop.
        splice @_;
        push @_, undef;
        next;

        # TODO: implement somehow?

      } elsif( $line =~ s/^\xFF\xFE//s ) {
        DEBUG and print STDERR "Little-endian UTF-16 BOM seen.  Aborting parsing.\n";
        $self->scream(
          $self->{'line_count'},
          "UTF16-LE Byte Encoding Mark found; but Pod::Simple v$Pod::Simple::VERSION doesn't implement UTF16 yet."
        );
        # NOTE(review): same modification-during-iteration concern as in
        #  the big-endian branch above.
        splice @_;
        push @_, undef;
        next;

        # TODO: implement somehow?

      } else {
        DEBUG > 2 and print STDERR "First line is BOM-less.\n";
        ($line = $source_line) =~ tr/\n\r//d;
      }
    }

    # First pod line with a non-ASCII character and no encoding chosen yet:
    # guess one, install it, transcode this line, and complain.
    if(!$self->{'parse_characters'} && !$self->{'encoding'}
      && ($self->{'in_pod'} || $line =~ /^=/s)
      && $line =~ /$non_ascii_re/
    ) {

      my $encoding = $self->_guess_line_encoding($line);

      $self->_handle_encoding_line( "=encoding $encoding" );
      # The faked directive must not count as a real =encoding line.
      delete $self->{'_processed_encoding'};
      $self->{'_transcoder'} && $self->{'_transcoder'}->($line);

      my ($word) = $line =~ /(\S*$non_ascii_re\S*)/;

      $self->whine(
        $self->{'line_count'},
        "Non-ASCII character seen before =encoding in '$word'. Assuming $encoding"
      );
    }

    DEBUG > 5 and print STDERR "# Parsing line: [$line]\n";

    if(!$self->{'in_pod'}) {
      if($line =~ m/^=([a-zA-Z][a-zA-Z0-9]*)(?:\s|$)/s) {
        if($1 eq 'cut') {
          $self->scream(
            $self->{'line_count'},
            "=cut found outside a pod block.  Skipping to next block."
          );

          ## Before there were errata sections in the world, it was
          ## least-pessimal to abort processing the file.  But now we can
          ## just barrel on thru (but still not start a pod block).
          #splice @_;
          #push @_, undef;

          next;
        } else {
          $self->{'in_pod'} = $self->{'start_of_pod_block'}
                            = $self->{'last_was_blank'}     = 1;
          # And fall thru to the pod-mode block further down
        }
      } else {
        DEBUG > 5 and print STDERR "# It's a code-line.\n";
        # "map $_, ..." hands the handler copies rather than our variables.
        $code_handler->(map $_, $line, $self->{'line_count'}, $self)
         if $code_handler;
        # Note: this may cause code to be processed out of order relative
        #  to pods, but in order relative to cuts.

        # Note also that we haven't yet applied the transcoding to $line
        #  by time we call $code_handler!

        if( $line =~ m/^#\s*line\s+(\d+)\s*(?:\s"([^"]+)")?\s*$/ ) {
          # That RE is from perlsyn, section "Plain Old Comments (Not!)",
          #$fname = $2 if defined $2;
          #DEBUG > 1 and defined $2 and print STDERR "# Setting fname to \"$fname\"\n";
          DEBUG > 1 and print STDERR "# Setting nextline to $1\n";
          # Minus one, because line_count is incremented for the next line.
          $self->{'line_count'} = $1 - 1;
        }

        next;
      }
    }

    # . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
    # Else we're in pod mode:

    # Apply any necessary transcoding:
    $self->{'_transcoder'} && $self->{'_transcoder'}->($line);

    # HERE WE CATCH =encoding EARLY!
    if( $line =~ m/^=encoding\s+\S+\s*$/s ) {
      next if $self->parse_characters;   # Ignore this line
      $line = $self->_handle_encoding_line( $line );
    }

    if($line =~ m/^=cut/s) {
      # here ends the pod block, and therefore the previous pod para
      DEBUG > 1 and print STDERR "Noting =cut at line ${$self}{'line_count'}\n";
      $self->{'in_pod'} = 0;
      # ++$self->{'pod_para_count'};
      $self->_ponder_paragraph_buffer();
       # by now it's safe to consider the previous paragraph as done.
      $cut_handler->(map $_, $line, $self->{'line_count'}, $self)
       if $cut_handler;

      # TODO: add to docs: Note: this may cause cuts to be processed out
      #  of order relative to pods, but in order relative to code.

    } elsif($line =~ m/^(\s*)$/s) {  # it's a blank line
      if (defined $1 and $1 =~ /[^\S\r\n]/) { # it's a white line
        $wl_handler->(map $_, $line, $self->{'line_count'}, $self)
          if $wl_handler;
      }

      if(!$self->{'start_of_pod_block'} and @$paras and $paras->[-1][0] eq '~Verbatim') {
        DEBUG > 1 and print STDERR "Saving blank line at line ${$self}{'line_count'}\n";
        push @{$paras->[-1]}, $line;
      }  # otherwise it's not interesting

      if(!$self->{'start_of_pod_block'} and !$self->{'last_was_blank'}) {
        DEBUG > 1 and print STDERR "Noting para ends with blank line at ${$self}{'line_count'}\n";
      }

      $self->{'last_was_blank'} = 1;

    } elsif($self->{'last_was_blank'}) {  # A non-blank line starting a new para...

      if($line =~ m/^(=[a-zA-Z][a-zA-Z0-9]*)(?:\s+|$)(.*)/s) {
        # THIS IS THE ONE PLACE WHERE WE CONSTRUCT NEW DIRECTIVE OBJECTS
        my $new = [$1, {'start_line' => $self->{'line_count'}}, $2];
         # Note that in "=head1 foo", the WS is lost.
         # Example: ['=head1', {'start_line' => 123}, ' foo']

        ++$self->{'pod_para_count'};

        $self->_ponder_paragraph_buffer();
         # by now it's safe to consider the previous paragraph as done.

        push @$paras, $new; # the new incipient paragraph
        DEBUG > 1 and print STDERR "Starting new ${$paras}[-1][0] para at line ${$self}{'line_count'}\n";

      } elsif($line =~ m/^\s/s) {
        # An indented line: verbatim text.

        if(!$self->{'start_of_pod_block'} and @$paras and $paras->[-1][0] eq '~Verbatim') {
          DEBUG > 1 and print STDERR "Resuming verbatim para at line ${$self}{'line_count'}\n";
          push @{$paras->[-1]}, $line;
        } else {
          ++$self->{'pod_para_count'};
          $self->_ponder_paragraph_buffer();
           # by now it's safe to consider the previous paragraph as done.
          DEBUG > 1 and print STDERR "Starting verbatim para at line ${$self}{'line_count'}\n";
          push @$paras, ['~Verbatim', {'start_line' => $self->{'line_count'}}, $line];
        }
      } else {
        ++$self->{'pod_para_count'};
        $self->_ponder_paragraph_buffer();
         # by now it's safe to consider the previous paragraph as done.
        push @$paras, ['~Para',  {'start_line' => $self->{'line_count'}}, $line];
        DEBUG > 1 and print STDERR "Starting plain para at line ${$self}{'line_count'}\n";
      }
      $self->{'last_was_blank'} = $self->{'start_of_pod_block'} = 0;

    } else {
      # It's a non-blank line /continuing/ the current para
      if(@$paras) {
        DEBUG > 2 and print STDERR "Line ${$self}{'line_count'} continues current paragraph\n";
        push @{$paras->[-1]}, $line;
      } else {
        # Unexpected case!
        die "Continuing a paragraph but \@\$paras is empty?";
      }
      $self->{'last_was_blank'} = $self->{'start_of_pod_block'} = 0;
    }

  } # ends the big foreach loop over the source lines

  DEBUG > 1 and print STDERR (pretty(@$paras), "\n");
  return $self;
}
388
389#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
390
# Act on an "=encoding NAME" line: the point of this routine is to set
# $self->{'_transcoder'} as indicated.
#
#   $line   - the candidate line
#   returns - nothing if parse_characters is set; otherwise $line, unchanged
#
# For a well-formed line, the request is appended to
# $self->{'encoding_command_reqs'} and its outcome (undef or '' for no
# complaint, otherwise an error message) to
# $self->{'encoding_command_statuses'}; $self->{'_processed_encoding'} is set
# to the requested name.  Dies if a transcoder is unexpectedly already set,
# or if the freshly made transcoder fails a trial run.
#
sub _handle_encoding_line {
  my($self, $line) = @_;

  return if $self->parse_characters;

  # Anything that isn't a well-formed =encoding line is handed straight back.
  $line =~ m/^=encoding\s+(\S+)\s*$/s or return $line;
  my $requested = $1;
  DEBUG > 1 and print STDERR "Found an encoding line \"=encoding $requested\"\n";

  push @{ $self->{'encoding_command_reqs'} }, "=encoding $requested";

  # Cf.   perldoc Encode   and   perldoc Encode::Supported

  require Pod::Simple::Transcode;

  # Reduce an encoding name to lowercase with hyphens and underscores
  # removed, so that e.g. "UTF-8" and "utf8" compare equal.
  my $squash = sub {
    my $name = lc($_[0]);
    $name =~ tr/-_//d;
    return $name;
  };

  my $enc_error;

  if( $self->{'encoding'} ) {
    # An encoding is already in force; see whether this one agrees with it.
    my $norm_current = $squash->( $self->{'encoding'} );
    my $norm_e       = $squash->( $requested );

    if($norm_current ne $norm_e) {
      $enc_error = "Encoding is already set to " . $self->{'encoding'};
      DEBUG > 1 and print STDERR $enc_error;
    } else {
      DEBUG > 1 and print STDERR "The '=encoding $requested' line is ",
       "redundant.  ($norm_current eq $norm_e).  Ignoring.\n";
      $enc_error = '';
       # But that doesn't necessarily mean that the earlier one went okay
    }

  } else {
    # OK, let's turn on the encoding.  Note that the name is recorded even
    # if it then turns out to be unsupported.
    DEBUG > 1 and print STDERR " Setting encoding to $requested\n";
    $self->{'encoding'} = $requested;

    if( $requested eq 'HACKRAW' ) {
      DEBUG and print STDERR " Putting in HACKRAW (no-op) encoding mode.\n";

    } elsif( Pod::Simple::Transcode::->encoding_is_available($requested) ) {

      if( $self->{'_transcoder'} ) {   # should never happen
        $enc_error = "WHAT? _transcoder is already set?!";
        die $enc_error;
      }
      $self->{'_transcoder'} =
        Pod::Simple::Transcode::->make_transcoder($requested);

      # Trial run on throwaway strings, to find out now if it is broken.
      eval {
        my @x = ('', "abc", "123");
        $self->{'_transcoder'}->(@x);
      };
      if( $@ ) {
        $enc_error =
          "Really unexpected error setting up encoding $requested: $@\nAborting";
        die $enc_error;
      }
      $self->{'detected_encoding'} = $requested;

    } else {
      my @supported = Pod::Simple::Transcode::->all_encodings;

      # Note unsupported, and complain
      DEBUG and print STDERR " Encoding [$requested] is unsupported.",
        "\nSupporteds: @supported\n";

      # Look for a near match:
      my $suggestion = '';
      my $wanted     = $squash->($requested);
      foreach my $candidate (@supported) {
        if( $squash->($candidate) eq $wanted ) {
          $suggestion = "  (Maybe \"$requested\" should be \"$candidate\"?)";
          last;
        }
      }

      my $encmodver = Pod::Simple::Transcode::->encmodver;
      $enc_error =
          "This document probably does not appear as it should, because its "
        . "\"=encoding $requested\" line calls for an unsupported encoding."
        . $suggestion
        . "  [$encmodver\'s supported encodings are: @supported]";

      $self->scream( $self->{'line_count'}, $enc_error );
    }
  }

  push @{ $self->{'encoding_command_statuses'} }, $enc_error;

  # A marker left behind by an earlier call means a double declaration.
  $self->scream(
    $self->{'line_count'}, 'Cannot have multiple =encoding directives'
  ) if defined $self->{'_processed_encoding'};
  $self->{'_processed_encoding'} = $requested;

  return $line;
}
490
491# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
492
# Second look at an =encoding directive, this time as a paragraph.
# By the time this is called, the encoding (if well formed) will already
# have been acted on, leaving $self->{'_processed_encoding'} defined.  All
# that is left to do here is to report whatever went wrong.
#
#   $para   - the directive paragraph: [name, {attrs}, content...]
#   returns - nothing
#
sub _handle_encoding_second_level {
  my($self, $para) = @_;

  # Everything after the name and the attribute hash is content.
  my $content = join ' ', @{$para}[ 2 .. $#{$para} ];
  $content =~ s/^\s+//s;
  $content =~ s/\s+$//s;

  DEBUG > 2 and print STDERR "Ogling encoding directive: =encoding $content\n";

  unless( defined $self->{'_processed_encoding'} ) {
    # Nobody acted on it earlier, so it's a syntax error
    $self->whine( $para->[1]{'start_line'},
      "Invalid =encoding syntax: $content"
    );
    return;
  }

  #if($content ne $self->{'_processed_encoding'}) {
  #  Could it happen?
  #}
  delete $self->{'_processed_encoding'};

  # It's already been handled.  Check for errors.
  my $statuses = $self->{'encoding_command_statuses'};
  if( !$statuses ) {
    DEBUG > 2 and print STDERR " CRAZY ERROR: It wasn't really handled?!\n";
  } elsif( $statuses->[-1] ) {
    $self->whine( $para->[1]{'start_line'},
      sprintf "Couldn't do %s: %s",
        $self->{'encoding_command_reqs'}[-1],
        $statuses->[-1],
    );
  } else {
    DEBUG > 2 and print STDERR " (Yup, it was successfully handled already.)\n";
  }

  return;
}
531
532#~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`
533
{
my $m = -321;   # magic line number, given to every generated paragraph

# Return 0 or more fake-o paragraphs explaining the accumulated
#  errors on this document.
#
# Reads $self->{'errata'}, a hash from line number to a list of complaints.
# The result is empty if there are none; otherwise it is a "POD ERRORS"
# =head1, an introductory paragraph, and an =over ... =back region holding
# one =item per line number (in ascending numeric order), each followed by
# one ~Para per complaint for that line.
#
sub _gen_errata {
  my $self = $_[0];

  my $errata = $self->{'errata'};
  return() unless $errata and keys %$errata;

  # Fixed preamble first...
  my @out = (
    ['=head1', {'start_line' => $m, 'errata' => 1}, 'POD ERRORS'],
    ['~Para', {'start_line' => $m, '~cooked' => 1, 'errata' => 1},
     "Hey! ",
     ['B', {},
      'The above document had some coding errors, which are explained below:'
     ]
    ],
    ['=over',  {'start_line' => $m, 'errata' => 1}, ''],
  );

  # ...then the complaints, grouped by line...
  foreach my $line (sort {$a <=> $b} keys %$errata) {
    push @out, ['=item', {'start_line' => $m}, "Around line $line:"];
    foreach my $complaint (@{ $errata->{$line} }) {
      push @out, ['~Para', {'start_line' => $m, '~cooked' => 1}, $complaint];
    }
  }

  # TODO: report of unknown entities? unrenderable characters?

  # ...and finally the end of the list.
  push @out, ['=back',  {'start_line' => $m, 'errata' => 1}, ''];

  DEBUG and print STDERR "\n<<\n", pretty(\@out), "\n>>\n\n";

  return @out;
}

}
582
583#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
584
585##############################################################################
586##
587##  stop reading now stop reading now stop reading now stop reading now stop
588##
589##                         HERE IT BECOMES REALLY SCARY
590##
591##  stop reading now stop reading now stop reading now stop reading now stop
592##
593##############################################################################
594
595sub _ponder_paragraph_buffer {
596
597  # Para-token types as found in the buffer.
598  #   ~Verbatim, ~Para, ~end, =head1..4, =for, =begin, =end,
599  #   =over, =back, =item
600  #   and the null =pod (to be complained about if over one line)
601  #
602  # "~data" paragraphs are something we generate at this level, depending on
603  # a currently open =over region
604
605  # Events fired:  Begin and end for:
606  #                   directivename (like head1 .. head4), item, extend,
607  #                   for (from =begin...=end, =for),
608  #                   over-bullet, over-number, over-text, over-block,
609  #                   item-bullet, item-number, item-text,
610  #                   Document,
611  #                   Data, Para, Verbatim
612  #                   B, C, longdirname (TODO -- wha?), etc. for all directives
613  #
614
615  my $self = $_[0];
616  my $paras;
617  return unless @{$paras = $self->{'paras'}};
618  my $curr_open = ($self->{'curr_open'} ||= []);
619
620  my $scratch;
621
622  DEBUG > 10 and print STDERR "# Paragraph buffer: <<", pretty($paras), ">>\n";
623
624  # We have something in our buffer.  So apparently the document has started.
625  unless($self->{'doc_has_started'}) {
626    $self->{'doc_has_started'} = 1;
627
628    my $starting_contentless;
629    $starting_contentless =
630     (
631       !@$curr_open
632       and @$paras and ! grep $_->[0] ne '~end', @$paras
633        # i.e., if the paras is all ~ends
634     )
635    ;
636    DEBUG and print STDERR "# Starting ",
637      $starting_contentless ? 'contentless' : 'contentful',
638      " document\n"
639    ;
640
641    $self->_handle_element_start(
642      ($scratch = 'Document'),
643      {
644        'start_line' => $paras->[0][1]{'start_line'},
645        $starting_contentless ? ( 'contentless' => 1 ) : (),
646      },
647    );
648  }
649
650  my($para, $para_type);
651  while(@$paras) {
652    last if @$paras == 1 and
653      ( $paras->[0][0] eq '=over' or $paras->[0][0] eq '~Verbatim'
654        or $paras->[0][0] eq '=item' )
655    ;
656    # Those're the three kinds of paragraphs that require lookahead.
657    #   Actually, an "=item Foo" inside an <over type=text> region
658    #   and any =item inside an <over type=block> region (rare)
659    #   don't require any lookahead, but all others (bullets
660    #   and numbers) do.
661
662# TODO: whinge about many kinds of directives in non-resolving =for regions?
663# TODO: many?  like what?  =head1 etc?
664
665    $para = shift @$paras;
666    $para_type = $para->[0];
667
668    DEBUG > 1 and print STDERR "Pondering a $para_type paragraph, given the stack: (",
669      $self->_dump_curr_open(), ")\n";
670
671    if($para_type eq '=for') {
672      next if $self->_ponder_for($para,$curr_open,$paras);
673
674    } elsif($para_type eq '=begin') {
675      next if $self->_ponder_begin($para,$curr_open,$paras);
676
677    } elsif($para_type eq '=end') {
678      next if $self->_ponder_end($para,$curr_open,$paras);
679
680    } elsif($para_type eq '~end') { # The virtual end-document signal
681      next if $self->_ponder_doc_end($para,$curr_open,$paras);
682    }
683
684
685    # ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
686    #~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
687    if(grep $_->[1]{'~ignore'}, @$curr_open) {
688      DEBUG > 1 and
689       print STDERR "Skipping $para_type paragraph because in ignore mode.\n";
690      next;
691    }
692    #~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
693    # ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
694
695    if($para_type eq '=pod') {
696      $self->_ponder_pod($para,$curr_open,$paras);
697
698    } elsif($para_type eq '=over') {
699      next if $self->_ponder_over($para,$curr_open,$paras);
700
701    } elsif($para_type eq '=back') {
702      next if $self->_ponder_back($para,$curr_open,$paras);
703
704    } else {
705
706      # All non-magical codes!!!
707
708      # Here we start using $para_type for our own twisted purposes, to
709      #  mean how it should get treated, not as what the element name
710      #  should be.
711
712      DEBUG > 1 and print STDERR "Pondering non-magical $para_type\n";
713
714      my $i;
715
716      # Enforce some =headN discipline
717      if($para_type =~ m/^=head\d$/s
718         and ! $self->{'accept_heads_anywhere'}
719         and @$curr_open
720         and $curr_open->[-1][0] eq '=over'
721      ) {
722        DEBUG > 2 and print STDERR "'=$para_type' inside an '=over'!\n";
723        $self->whine(
724          $para->[1]{'start_line'},
725          "You forgot a '=back' before '$para_type'"
726        );
727        unshift @$paras, ['=back', {}, ''], $para;   # close the =over
728        next;
729      }
730
731
732      if($para_type eq '=item') {
733
734        my $over;
735        unless(@$curr_open and
736               $over = (grep { $_->[0] eq '=over' } @$curr_open)[-1]) {
737          $self->whine(
738            $para->[1]{'start_line'},
739            "'=item' outside of any '=over'"
740          );
741          unshift @$paras,
742            ['=over', {'start_line' => $para->[1]{'start_line'}}, ''],
743            $para
744          ;
745          next;
746        }
747
748
749        my $over_type = $over->[1]{'~type'};
750
751        if(!$over_type) {
752          # Shouldn't happen1
753          die "Typeless over in stack, starting at line "
754           . $over->[1]{'start_line'};
755
756        } elsif($over_type eq 'block') {
757          unless($curr_open->[-1][1]{'~bitched_about'}) {
758            $curr_open->[-1][1]{'~bitched_about'} = 1;
759            $self->whine(
760              $curr_open->[-1][1]{'start_line'},
761              "You can't have =items (as at line "
762              . $para->[1]{'start_line'}
763              . ") unless the first thing after the =over is an =item"
764            );
765          }
766          # Just turn it into a paragraph and reconsider it
767          $para->[0] = '~Para';
768          unshift @$paras, $para;
769          next;
770
771        } elsif($over_type eq 'text') {
772          my $item_type = $self->_get_item_type($para);
773            # That kills the content of the item if it's a number or bullet.
774          DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
775
776          if($item_type eq 'text') {
777            # Nothing special needs doing for 'text'
778          } elsif($item_type eq 'number' or $item_type eq 'bullet') {
779            $self->whine(
780              $para->[1]{'start_line'},
781              "Expected text after =item, not a $item_type"
782            );
783            # Undo our clobbering:
784            push @$para, $para->[1]{'~orig_content'};
785            delete $para->[1]{'number'};
786             # Only a PROPER item-number element is allowed
787             #  to have a number attribute.
788          } else {
789            die "Unhandled item type $item_type"; # should never happen
790          }
791
792          # =item-text thingies don't need any assimilation, it seems.
793
794        } elsif($over_type eq 'number') {
795          my $item_type = $self->_get_item_type($para);
796            # That kills the content of the item if it's a number or bullet.
797          DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
798
799          my $expected_value = ++ $curr_open->[-1][1]{'~counter'};
800
801          if($item_type eq 'bullet') {
802            # Hm, it's not numeric.  Correct for this.
803            $para->[1]{'number'} = $expected_value;
804            $self->whine(
805              $para->[1]{'start_line'},
806              "Expected '=item $expected_value'"
807            );
808            push @$para, $para->[1]{'~orig_content'};
809              # restore the bullet, blocking the assimilation of next para
810
811          } elsif($item_type eq 'text') {
812            # Hm, it's not numeric.  Correct for this.
813            $para->[1]{'number'} = $expected_value;
814            $self->whine(
815              $para->[1]{'start_line'},
816              "Expected '=item $expected_value'"
817            );
818            # Text content will still be there and will block next ~Para
819
820          } elsif($item_type ne 'number') {
821            die "Unknown item type $item_type"; # should never happen
822
823          } elsif($expected_value == $para->[1]{'number'}) {
824            DEBUG > 1 and print STDERR " Numeric item has the expected value of $expected_value\n";
825
826          } else {
827            DEBUG > 1 and print STDERR " Numeric item has ", $para->[1]{'number'},
828             " instead of the expected value of $expected_value\n";
829            $self->whine(
830              $para->[1]{'start_line'},
831              "You have '=item " . $para->[1]{'number'} .
832              "' instead of the expected '=item $expected_value'"
833            );
834            $para->[1]{'number'} = $expected_value;  # correcting!!
835          }
836
837          if(@$para == 2) {
838            # For the cases where we /didn't/ push to @$para
839            if($paras->[0][0] eq '~Para') {
840              DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n";
841              push @$para, splice @{shift @$paras},2;
842            } else {
843              DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n";
844              push @$para, '';  # Just so it's not contentless
845            }
846          }
847
848
849        } elsif($over_type eq 'bullet') {
850          my $item_type = $self->_get_item_type($para);
851            # That kills the content of the item if it's a number or bullet.
852          DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
853
854          if($item_type eq 'bullet') {
855            # as expected!
856
857            if( $para->[1]{'~_freaky_para_hack'} ) {
858              DEBUG and print STDERR "Accomodating '=item * Foo' tolerance hack.\n";
859              push @$para, delete $para->[1]{'~_freaky_para_hack'};
860            }
861
862          } elsif($item_type eq 'number') {
863            $self->whine(
864              $para->[1]{'start_line'},
865              "Expected '=item *'"
866            );
867            push @$para, $para->[1]{'~orig_content'};
868             # and block assimilation of the next paragraph
869            delete $para->[1]{'number'};
870             # Only a PROPER item-number element is allowed
871             #  to have a number attribute.
872          } elsif($item_type eq 'text') {
873            $self->whine(
874              $para->[1]{'start_line'},
875              "Expected '=item *'"
876            );
877             # But doesn't need processing.  But it'll block assimilation
878             #  of the next para.
879          } else {
880            die "Unhandled item type $item_type"; # should never happen
881          }
882
883          if(@$para == 2) {
884            # For the cases where we /didn't/ push to @$para
885            if($paras->[0][0] eq '~Para') {
886              DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n";
887              push @$para, splice @{shift @$paras},2;
888            } else {
889              DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n";
890              push @$para, '';  # Just so it's not contentless
891            }
892          }
893
894        } else {
895          die "Unhandled =over type \"$over_type\"?";
896          # Shouldn't happen!
897        }
898
899        $para_type = 'Plain';
900        $para->[0] .= '-' . $over_type;
901        # Whew.  Now fall thru and process it.
902
903
904      } elsif($para_type eq '=extend') {
905        # Well, might as well implement it here.
906        $self->_ponder_extend($para);
907        next;  # and skip
908      } elsif($para_type eq '=encoding') {
909        # Not actually acted on here, but we catch errors here.
910        $self->_handle_encoding_second_level($para);
911        next unless $self->keep_encoding_directive;
912        $para_type = 'Plain';
913      } elsif($para_type eq '~Verbatim') {
914        $para->[0] = 'Verbatim';
915        $para_type = '?Verbatim';
916      } elsif($para_type eq '~Para') {
917        $para->[0] = 'Para';
918        $para_type = '?Plain';
919      } elsif($para_type eq 'Data') {
920        $para->[0] = 'Data';
921        $para_type = '?Data';
922      } elsif( $para_type =~ s/^=//s
923        and defined( $para_type = $self->{'accept_directives'}{$para_type} )
924      ) {
925        DEBUG > 1 and print STDERR " Pondering known directive ${$para}[0] as $para_type\n";
926      } else {
927        # An unknown directive!
928        DEBUG > 1 and printf STDERR "Unhandled directive %s (Handled: %s)\n",
929         $para->[0], join(' ', sort keys %{$self->{'accept_directives'}} )
930        ;
931        $self->whine(
932          $para->[1]{'start_line'},
933          "Unknown directive: $para->[0]"
934        );
935
936        # And maybe treat it as text instead of just letting it go?
937        next;
938      }
939
940      if($para_type =~ s/^\?//s) {
941        if(! @$curr_open) {  # usual case
942          DEBUG and print STDERR "Treating $para_type paragraph as such because stack is empty.\n";
943        } else {
944          my @fors = grep $_->[0] eq '=for', @$curr_open;
945          DEBUG > 1 and print STDERR "Containing fors: ",
946            join(',', map $_->[1]{'target'}, @fors), "\n";
947
948          if(! @fors) {
949            DEBUG and print STDERR "Treating $para_type paragraph as such because stack has no =for's\n";
950
951          #} elsif(grep $_->[1]{'~resolve'}, @fors) {
952          #} elsif(not grep !$_->[1]{'~resolve'}, @fors) {
953          } elsif( $fors[-1][1]{'~resolve'} ) {
954            # Look to the immediately containing for
955
956            if($para_type eq 'Data') {
957              DEBUG and print STDERR "Treating Data paragraph as Plain/Verbatim because the containing =for ($fors[-1][1]{'target'}) is a resolver\n";
958              $para->[0] = 'Para';
959              $para_type = 'Plain';
960            } else {
961              DEBUG and print STDERR "Treating $para_type paragraph as such because the containing =for ($fors[-1][1]{'target'}) is a resolver\n";
962            }
963          } else {
964            DEBUG and print STDERR "Treating $para_type paragraph as Data because the containing =for ($fors[-1][1]{'target'}) is a non-resolver\n";
965            $para->[0] = $para_type = 'Data';
966          }
967        }
968      }
969
970      #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
971      if($para_type eq 'Plain') {
972        $self->_ponder_Plain($para);
973      } elsif($para_type eq 'Verbatim') {
974        $self->_ponder_Verbatim($para);
975      } elsif($para_type eq 'Data') {
976        $self->_ponder_Data($para);
977      } else {
978        die "\$para type is $para_type -- how did that happen?";
979        # Shouldn't happen.
980      }
981
982      #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
983      $para->[0] =~ s/^[~=]//s;
984
985      DEBUG and print STDERR "\n", pretty($para), "\n";
986
987      # traverse the treelet (which might well be just one string scalar)
988      $self->{'content_seen'} ||= 1;
989      $self->_traverse_treelet_bit(@$para);
990    }
991  }
992
993  return;
994}
995
996###########################################################################
997# The sub-ponderers...
998
999
1000
# Handle a "=for TARGET text..." paragraph by rewriting it as the
# three-paragraph sequence  =begin TARGET / Data paragraph / =end TARGET,
# which is queued at the front of @$paras.
#
#   $para       the ['=for', {attrs}, lines...] paragraph
#   $curr_open  arrayref: stack of currently open regions
#   $paras      arrayref: queue of paragraphs still to be pondered
#
# Always returns 1.
sub _ponder_for {
  my ($self, $para, $curr_open, $paras) = @_;

  # Inside an ignored region there is nothing to fake out.
  if (grep { $_->[1]{'~ignore'} } @$curr_open) {
    DEBUG > 1 and print STDERR "Ignoring ignorable =for\n";
    return 1;
  }

  # The target is the first run of non-whitespace found in the content
  # lines.  It is cut out of that line (together with the whitespace
  # around it); lines before it that hold no such run are left untouched.
  my $target;
  foreach my $chunk (@$para[2 .. $#$para]) {   # aliases the real elements
    next unless $chunk =~ s/^\s*(\S+)\s*//s;
    $target = $1;
    last;
  }

  if (!defined $target) {
    $self->whine(
      $para->[1]{'start_line'},
      "=for without a target?"
    );
    return 1;
  }

  DEBUG > 1 and
   print STDERR "Faking out a =for $target as a =begin $target / =end $target\n";

  # What is left of the paragraph becomes the region's sole Data paragraph.
  $para->[0] = 'Data';

  # The synthetic opener and closer each get their own attribute hash,
  # marked so that they can be told apart from a real =begin/=end.
  my $line   = $para->[1]{'start_line'};
  my $opener = ['=begin', {'start_line' => $line, '~really' => '=for'}, $target];
  my $closer = ['=end',   {'start_line' => $line, '~really' => '=for'}, $target];

  unshift @$paras, $opener, $para, $closer;

  return 1;
}
1044
# Handle a "=begin TARGET [TITLE]" paragraph.
#
# The paragraph is turned into an internal '=for' region record and pushed
# onto the @$curr_open stack.  Its attribute hash receives:
#   target           the target word exactly as written (with any ! or :)
#   title            whatever text followed the target, if any
#   target_matching  the accepted target name that matched ('!' for a
#                    successful negated match); absent when nothing matched
#   ~really          '=begin' unless already set
#   ~ignore          1 if the region's content is to be skipped, else 0
#   ~resolve         1 if formatting codes in the region are to be
#                    processed, else 0
# A 'for' element-start event is fired unless this region, or any region
# enclosing it, is being ignored.
#
#   $para       the ['=begin', {attrs}, lines...] paragraph
#   $curr_open  arrayref: stack of currently open regions
#   $paras      arrayref: queue of pending paragraphs (not used here)
#
# Always returns 1.
sub _ponder_begin {
  my ($self,$para,$curr_open,$paras) = @_;
  my $content = join ' ', splice @$para, 2;
  $content =~ s/^\s+//s;
  $content =~ s/\s+$//s;
  unless(length($content)) {
    $self->whine(
      $para->[1]{'start_line'},
      "=begin without a target?"
    );
    DEBUG and print STDERR "Ignoring targetless =begin\n";
    return 1;
  }

  # $content is non-empty and starts with a non-space character here, so
  # with /s (letting "." cross an embedded newline) this always matches.
  my ($target, $title) = $content =~ m/^(\S+)\s*(.*)$/s;

  # Test the length, not the truth, of the title: "0" is a valid title.
  $para->[1]{'title'} = $title if defined $title and length $title;
  $para->[1]{'target'} = $target;  # the target word exactly as written
  $content = $target; # strip off the title

  $content =~ s/^:!/!:/s;  # normalize ":!foo" to "!:foo"
  my $neg;  # whether this is a negation-match
  $neg = 1        if $content =~ s/^!//s;
  my $to_resolve;  # whether to process formatting codes
  $to_resolve = 1 if $content =~ s/^://s;

  my $dont_ignore; # whether this target matches us

  # Try each comma-separated target name in turn; for a non-negated
  # =begin, the wildcard '*' is tried last.
  foreach my $target_name (
    split(',', $content, -1),
    $neg ? () : '*'
  ) {
    DEBUG > 2 and
     print STDERR " Considering whether =begin $content matches $target_name\n";
    next unless $self->{'accept_targets'}{$target_name};

    DEBUG > 2 and
     print STDERR "  It DOES match the acceptable target $target_name!\n";
    $to_resolve = 1
      if $self->{'accept_targets'}{$target_name} eq 'force_resolve';
    $dont_ignore = 1;
    $para->[1]{'target_matching'} = $target_name;
    last; # stop looking at other target names
  }

  # A leading "!" inverts the outcome of the matching above.
  if($neg) {
    if( $dont_ignore ) {
      $dont_ignore = '';
      delete $para->[1]{'target_matching'};
      DEBUG > 2 and print STDERR " But the leading ! means that this is a NON-match!\n";
    } else {
      $dont_ignore = 1;
      $para->[1]{'target_matching'} = '!';
      DEBUG > 2 and print STDERR " But the leading ! means that this IS a match!\n";
    }
  }

  $para->[0] = '=for';  # Just what we happen to call these, internally
  $para->[1]{'~really'} ||= '=begin';
  $para->[1]{'~ignore'}   = (! $dont_ignore) || 0;
  $para->[1]{'~resolve'}  = $to_resolve || 0;

  DEBUG > 1 and print STDERR " Making note to ", $dont_ignore ? 'not ' : '',
    "ignore contents of this region\n";
  DEBUG > 1 and $dont_ignore and print STDERR " Making note to treat contents as ",
    ($to_resolve ? 'verbatim/plain' : 'data'), " paragraphs\n";
  DEBUG > 1 and print STDERR " (Stack now: ", $self->_dump_curr_open(), ")\n";

  # The region is recorded on the stack even when ignored, so that the
  # matching =end can find and close it.
  push @$curr_open, $para;
  if(!$dont_ignore or scalar grep $_->[1]{'~ignore'}, @$curr_open) {
    DEBUG > 1 and print STDERR "Ignoring ignorable =begin\n";
  } else {
    $self->{'content_seen'} ||= 1;
    # The element name is passed as a copy in a scratch variable.
    $self->_handle_element_start((my $scratch='for'), $para->[1]);
  }

  return 1;
}
1122
# Handle an "=end TARGET" paragraph: validate it against the innermost open
# region and, if it matches, close that region.
#
# The directive is complained about (via whine) and otherwise ignored when
# it has no target, has more than one word, does not directly follow an open
# =begin region on the stack, or names a target different from that region's.
# On success the region is popped off @$curr_open, and a 'for' element-end
# event is fired unless some open region is being ignored.
#
#   $para       the ['=end', {attrs}, lines...] paragraph
#   $curr_open  arrayref: stack of currently open regions
#   $paras      arrayref: queue of pending paragraphs (not used here)
#
# Always returns 1.
sub _ponder_end {
  my ($self, $para, $curr_open, $paras) = @_;

  my $content = join ' ', splice @$para, 2;
  for ($content) { s/^\s+//s; s/\s+$//s }   # trim both ends
  DEBUG and print STDERR "Ogling '=end $content' directive\n";

  my $line          = $para->[1]{'start_line'};
  my $innermost     = @$curr_open ? $curr_open->[-1] : undef;
  my $closing_a_for = ($innermost and $innermost->[0] eq '=for');

  if (!length $content) {
    # Suggest the right target when we can tell what it should have been.
    my $hint = $closing_a_for
      ? ( " (Should be \"=end " . $innermost->[1]{'target'} . '")' )
      : '';
    $self->whine($line, "'=end' without a target?" . $hint);
    DEBUG and print STDERR "Ignoring targetless =end\n";
    return 1;
  }

  if ($content !~ m/^\S+$/) {  # i.e., it's not exactly one word
    $self->whine(
      $line,
      "'=end $content' is invalid.  (Stack: "
      . $self->_dump_curr_open() . ')'
    );
    DEBUG and print STDERR "Ignoring mistargetted =end $content\n";
    return 1;
  }

  if (!$closing_a_for) {
    $self->whine(
      $line,
      "=end $content without matching =begin.  (Stack: "
      . $self->_dump_curr_open() . ')'
    );
    DEBUG and print STDERR "Ignoring mistargetted =end $content\n";
    return 1;
  }

  if ($content ne $innermost->[1]{'target'}) {
    $self->whine(
      $line,
      "=end $content doesn't match =begin "
      . $innermost->[1]{'target'}
      . ".  (Stack: "
      . $self->_dump_curr_open() . ')'
    );
    DEBUG and print STDERR "Ignoring mistargetted =end $content at line $para->[1]{'start_line'}\n";
    return 1;
  }

  # It's okay to close.  No event is fired when any open region -- the one
  # being closed or one around it -- is being ignored.
  my $in_ignored_region = grep { $_->[1]{'~ignore'} } @$curr_open;
  if ($in_ignored_region) {
    DEBUG > 1 and print STDERR "Not firing any event for this =end $content because in an ignored region\n";
  } else {
    # The open region's start_line is overwritten with this =end's line.
    $innermost->[1]{'start_line'} = $line;

    $self->{'content_seen'} ||= 1;
    $self->_handle_element_end( my $scratch = 'for', $para->[1]);
  }

  DEBUG > 1 and print STDERR "Popping $curr_open->[-1][0] $curr_open->[-1][1]{'target'} because of =end $content\n";
  pop @$curr_open;

  return 1;
}
1192
# Handle the virtual end-of-document paragraph ('~end').
#
# This works in up to three passes, each triggered by meeting a '~end':
#  1. If regions are still open, closer paragraphs for them are queued,
#     followed by two copies of the '~end' paragraph.
#  2. Once the stack is empty, the errata section is generated (this is
#     attempted only once per parser) and, if there is one, queued.
#  3. Otherwise the queue is emptied and the Document element is ended.
#
#   $para       the '~end' paragraph
#   $curr_open  arrayref: stack of currently open regions
#   $paras      arrayref: queue of paragraphs still to be pondered
#
# Always returns 1.
sub _ponder_doc_end {
  my ($self, $para, $curr_open, $paras) = @_;

  if (@$curr_open) { # Deal with things left open
    DEBUG and print STDERR "Stack is nonempty at end-document: (",
      $self->_dump_curr_open(), ")\n";

    DEBUG > 9 and print STDERR "Stack: ", pretty($curr_open), "\n";

    # Closers go first; every '~end' already queued is dropped; then two
    # '~end's are appended -- one for the cycle in which errata get
    # generated, and another to be met once the errata have been processed.
    my @closers = $self->_closers_for_all_curr_open;
    @$paras = (
      (grep { $_->[0] ne '~end' } @closers, @$paras),
      $para,
      $para,
    );
    return 1;
  }

  DEBUG and print STDERR "Okay, stack is empty now.\n";

  # Try generating errata section, if applicable
  if (!$self->{'~tried_gen_errata'}) {
    $self->{'~tried_gen_errata'} = 1;
    if (my @extras = $self->_gen_errata()) {
      unshift @$paras, @extras;
      DEBUG and print STDERR "Generated errata... relooping...\n";
      return 1;  # I.e., loop around again to process these fake-o paragraphs
    }
  }

  @$paras = (); # Well, that's that for this paragraph buffer.
  DEBUG and print STDERR "Throwing end-document event.\n";

  $self->_handle_element_end( my $scratch = 'Document' );
  return 1;
}
1230
# Handle a "=pod" paragraph.
#
# A =pod paragraph with more than one line of content is complained about.
# If the parser has a 'pod_handler' callback, it is called as
#   $pod_handler->($text, $line_number, $parser)
# where $text is "=pod", or "=pod " followed by the first content line when
# that line is not empty.
#
# Note that this method does not set 'content_seen'.
#
# Returns nothing (a bare return).
sub _ponder_pod {
  my ($self, $para, $curr_open, $paras) = @_;

  if (@$para > 3) {
    my $line_count = @$para - 2;
    $self->whine(
      $para->[1]{'start_line'},
      "=pod directives shouldn't be over one line long!  Ignoring all "
       . $line_count . " lines of content"
    );
  }

  # Content ignored unless 'pod_handler' is set
  my $pod_handler = $self->{'pod_handler'};
  if ($pod_handler) {
    # Work on copies, so that the handler cannot alter the paragraph.
    my ($line_num, $text) = ($para->[1]{'start_line'}, $para->[2]);
    my $line = $text eq '' ? "=pod" : "=pod $text"; # imitate cut_handler output
    $pod_handler->($line, $line_num, $self);
  }

  return;
}
1252
# Handle an "=over [INDENT]" paragraph: decide what kind of list it opens,
# push it onto the @$curr_open stack, and fire the 'over-TYPE' start event.
#
# The list type is chosen by peeking at the next queued paragraph:
#   =item   whatever _get_initial_item_type says about that item
#   =back   'empty' if the parser's 'parse_empty_lists' is set; otherwise
#           the =back is discarded along with this =over
#   ~end    nothing is opened, and a complaint is recorded
#   other   'block'
# The 'indent' attribute is the number given after =over, or 4 when there is
# none, when it is zero, or when it is not a number.
#
#   $para       the ['=over', {attrs}, lines...] paragraph
#   $curr_open  arrayref: stack of currently open regions
#   $paras      arrayref: queue of paragraphs still to be pondered
#
# Returns 1 when no list was opened; otherwise returns nothing.
sub _ponder_over {
  my ($self, $para, $curr_open, $paras) = @_;
  return 1 unless @$paras;

  my $next_kind = $paras->[0][0];
  my $list_type;

  if ($next_kind eq '=item') { # most common case
    $list_type = $self->_get_initial_item_type($paras->[0]);

  } elsif ($next_kind eq '=back') {
    # Ignore empty lists by default
    unless ($self->{'parse_empty_lists'}) {
      shift @$paras;
      return 1;
    }
    $list_type = 'empty';

  } elsif ($next_kind eq '~end') {
    $self->whine(
      $para->[1]{'start_line'},
      "=over is the last thing in the document?!"
    );
    return 1; # But feh, ignore it.

  } else {
    $list_type = 'block';
  }

  my $attrs = $para->[1];
  $attrs->{'~type'} = $list_type;
  push @$curr_open, $para;
   # yes, we reuse the paragraph as a stack item

  my $content = join ' ', splice @$para, 2;

  my $indent = 4;  # used unless a usable number was given
  if ($content =~ m/^\s*((?:\d*\.)?\d+)\s*$/s) {
    my $requested = $1;
    no integer;    # so that a fractional indent is not truncated
    if ($requested == 0) {
      $self->whine(
        $attrs->{'start_line'},
        "Can't have a 0 in =over $content"
      );
    } else {
      $indent = $requested;
    }
  } elsif ($content !~ m/^\s*$/s) {
    $self->whine(
      $attrs->{'start_line'},
      "=over should be: '=over' or '=over positive_number'"
    );
  }
  $attrs->{'indent'} = $indent;

  DEBUG > 1 and print STDERR "=over found of type $list_type\n";

  $self->{'content_seen'} ||= 1;
  $self->_handle_element_start((my $scratch = 'over-' . $list_type), $attrs);

  return;
}
1310
# Handle a "=back" paragraph: close the innermost open region if (and only
# if) it is an =over.
#
# Any content after =back is complained about.  When the top of the
# @$curr_open stack is an =over, it is popped and an 'over-TYPE' element-end
# event is fired; in that case the return value is whatever
# _handle_element_end returned.  Otherwise a complaint is recorded and 1 is
# returned.
#
#   $para       the ['=back', {attrs}, lines...] paragraph
#   $curr_open  arrayref: stack of currently open regions
#   $paras      arrayref: queue of pending paragraphs (not used here)
sub _ponder_back {
  my ($self, $para, $curr_open, $paras) = @_;
  # TODO: fire off </item-number> or </item-bullet> or </item-text> ??

  my $content = join ' ', splice @$para, 2;
  $self->whine(
    $para->[1]{'start_line'},
    "=back doesn't take any parameters, but you said =back $content"
  ) if $content =~ m/\S/;

  unless (@$curr_open and $curr_open->[-1][0] eq '=over') {
    DEBUG > 1 and print STDERR "=back found without a matching =over.  Stack: (",
        join(', ', map $_->[0], @$curr_open), ").\n";
    $self->whine(
      $para->[1]{'start_line'},
      '=back without =over'
    );
    return 1; # and ignore it
  }

  # Expected case: we're closing the most recently opened thing
  DEBUG > 1 and print STDERR "=back happily closes matching =over\n";
  $self->{'content_seen'} ||= 1;

  my $closed  = pop @$curr_open;
  my $scratch = 'over-' . $closed->[1]{'~type'};
  return $self->_handle_element_end($scratch, $para->[1]);
}
1341
# Handle an "=item" paragraph according to the type of the list it is in.
#
# The enclosing list is the innermost =over anywhere on the @$curr_open
# stack (so an =item may sit inside a =begin region opened within the list).
# All per-list bookkeeping -- the running item number and the "already
# complained" flag -- is kept on that =over, not on whatever region happens
# to be on top of the stack.
#
# Outcomes:
#  * no enclosing =over: complain, queue a synthetic =over followed by this
#    paragraph, and return 1;
#  * 'block' list: complain (once per list), requeue the paragraph as a
#    plain '~Para', and return 1;
#  * 'text', 'number', 'bullet' lists: reconcile the item with the list
#    type (complaining about mismatches), rename the paragraph to
#    '=item-TYPE', and return nothing.
# In number and bullet lists an item left without content takes over the
# content of an immediately following '~Para' paragraph.
#
#   $para       the ['=item', {attrs}, lines...] paragraph
#   $curr_open  arrayref: stack of currently open regions
#   $paras      arrayref: queue of paragraphs still to be pondered
sub _ponder_item {
  my ($self,$para,$curr_open,$paras) = @_;

  my $over;
  unless(@$curr_open and
         $over = (grep { $_->[0] eq '=over' } @$curr_open)[-1]) {
    $self->whine(
      $para->[1]{'start_line'},
      "'=item' outside of any '=over'"
    );
    unshift @$paras,
      ['=over', {'start_line' => $para->[1]{'start_line'}}, ''],
      $para
    ;
    return 1;
  }

  my $over_type = $over->[1]{'~type'};

  # For the cases where nothing was pushed onto @$para: give the item some
  # content, taken from the next paragraph if that is a '~Para'.  The queue
  # is checked for emptiness first, so that peeking at it cannot create a
  # bogus paragraph.
  my $assimilate_following_para = sub {
    return unless @$para == 2;
    if(@$paras and $paras->[0][0] eq '~Para') {
      DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n";
      push @$para, splice @{shift @$paras},2;
    } else {
      DEBUG and print STDERR "Can't assimilate following ",
        (@$paras ? $paras->[0][0] : '(nothing)'), "\n";
      push @$para, '';  # Just so it's not contentless
    }
    return;
  };

  if(!$over_type) {
    # Shouldn't happen!
    die "Typeless over in stack, starting at line "
     . $over->[1]{'start_line'};

  } elsif($over_type eq 'block') {
    # Complain just once per list, at the line where the list started.
    unless($over->[1]{'~bitched_about'}) {
      $over->[1]{'~bitched_about'} = 1;
      $self->whine(
        $over->[1]{'start_line'},
        "You can't have =items (as at line "
        . $para->[1]{'start_line'}
        . ") unless the first thing after the =over is an =item"
      );
    }
    # Just turn it into a paragraph and reconsider it
    $para->[0] = '~Para';
    unshift @$paras, $para;
    return 1;

  } elsif($over_type eq 'text') {
    my $item_type = $self->_get_item_type($para);
      # That kills the content of the item if it's a number or bullet.
    DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";

    if($item_type eq 'text') {
      # Nothing special needs doing for 'text'
    } elsif($item_type eq 'number' or $item_type eq 'bullet') {
      $self->whine(
          $para->[1]{'start_line'},
          "Expected text after =item, not a $item_type"
      );
      # Undo our clobbering:
      push @$para, $para->[1]{'~orig_content'};
      delete $para->[1]{'number'};
       # Only a PROPER item-number element is allowed
       #  to have a number attribute.
    } else {
      die "Unhandled item type $item_type"; # should never happen
    }

    # =item-text thingies don't need any assimilation, it seems.

  } elsif($over_type eq 'number') {
    my $item_type = $self->_get_item_type($para);
      # That kills the content of the item if it's a number or bullet.
    DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";

    # Count on the list itself, so that numbering carries on correctly
    # across items that sit inside a nested =begin region.
    my $expected_value = ++ $over->[1]{'~counter'};

    if($item_type eq 'bullet') {
      # Hm, it's not numeric.  Correct for this.
      $para->[1]{'number'} = $expected_value;
      $self->whine(
        $para->[1]{'start_line'},
        "Expected '=item $expected_value'"
      );
      push @$para, $para->[1]{'~orig_content'};
        # restore the bullet, blocking the assimilation of next para

    } elsif($item_type eq 'text') {
      # Hm, it's not numeric.  Correct for this.
      $para->[1]{'number'} = $expected_value;
      $self->whine(
        $para->[1]{'start_line'},
        "Expected '=item $expected_value'"
      );
      # Text content will still be there and will block next ~Para

    } elsif($item_type ne 'number') {
      die "Unknown item type $item_type"; # should never happen

    } elsif($expected_value == $para->[1]{'number'}) {
      DEBUG > 1 and print STDERR " Numeric item has the expected value of $expected_value\n";

    } else {
      DEBUG > 1 and print STDERR " Numeric item has ", $para->[1]{'number'},
       " instead of the expected value of $expected_value\n";
      $self->whine(
        $para->[1]{'start_line'},
        "You have '=item " . $para->[1]{'number'} .
        "' instead of the expected '=item $expected_value'"
      );
      $para->[1]{'number'} = $expected_value;  # correcting!!
    }

    $assimilate_following_para->();

  } elsif($over_type eq 'bullet') {
    my $item_type = $self->_get_item_type($para);
      # That kills the content of the item if it's a number or bullet.
    DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";

    if($item_type eq 'bullet') {
      # as expected!

      if( $para->[1]{'~_freaky_para_hack'} ) {
        DEBUG and print STDERR "Accomodating '=item * Foo' tolerance hack.\n";
        push @$para, delete $para->[1]{'~_freaky_para_hack'};
      }

    } elsif($item_type eq 'number') {
      $self->whine(
        $para->[1]{'start_line'},
        "Expected '=item *'"
      );
      push @$para, $para->[1]{'~orig_content'};
       # and block assimilation of the next paragraph
      delete $para->[1]{'number'};
       # Only a PROPER item-number element is allowed
       #  to have a number attribute.
    } elsif($item_type eq 'text') {
      $self->whine(
        $para->[1]{'start_line'},
        "Expected '=item *'"
      );
       # But doesn't need processing.  But it'll block assimilation
       #  of the next para.
    } else {
      die "Unhandled item type $item_type"; # should never happen
    }

    $assimilate_following_para->();

  } else {
    die "Unhandled =over type \"$over_type\"?";
    # Shouldn't happen!
  }

  # E.g. '=item' in a bullet list becomes '=item-bullet'.
  $para->[0] .= '-' . $over_type;

  return;
}
1512
# Give a paragraph "plain" treatment: replace its raw content lines with
# the treelet content that _make_treelet builds from them.
#
# The paragraph is left exactly as it is when it has no content, when its
# only content is a single empty string, or when it is flagged '~cooked'
# (i.e. it already carries a treelet).
#
#   $para   the [name, {attrs}, lines...] paragraph, altered in place
#
# Returns nothing.
sub _ponder_Plain {
  my ($self, $para) = @_;
  DEBUG and print STDERR " giving plain treatment...\n";

  # Empty paragraphs don't need a treelet for any reason I can see.
  return if @$para == 2;
  return if @$para == 3 and $para->[2] eq '';

  # And precooked paragraphs already have a treelet.
  return if $para->[1]{'~cooked'};

  # Pull the content lines out of the paragraph, then put back what
  # _make_treelet makes of them.
  my $text = join("\n", splice(@$para, 2));
  push @$para, @{ $self->_make_treelet($text, $para->[1]{'start_line'}) };

  return;
}
1529
# Give a paragraph "verbatim" treatment.
#
# Steps, all carried out on the paragraph in place:
#  1. Set its 'xml:space' attribute to 'preserve'.
#  2. Work out the indentation to strip, from strip_verbatim_indent.  If
#     that is a code reference, it is called with the paragraph temporarily
#     reduced to its content lines, and what it returns is used.
#  3. On every content line: strip that indentation (but never when
#     VerbatimFormatted codes are accepted), then expand tabs to spaces
#     with tab stops at every eighth column.
#  4. Finish in one of three ways:
#     - VerbatimFormatted accepted: drop trailing blank lines (keeping at
#       least one content line) and hand over to _verbatim_format;
#     - 'codes_in_verbatim' set: replace the lines with the treelet content
#       that _make_treelet builds from them;
#     - otherwise: join the lines into a single string.
#     In the last two cases, newlines ending the last element are removed.
#
#   $para   the [name, {attrs}, lines...] paragraph
#
# Returns nothing.
sub _ponder_Verbatim {
  my ($self, $para) = @_;
  DEBUG and print STDERR " giving verbatim treatment...\n";

  my $attrs = $para->[1]{'xml:space'} = 'preserve';

  my $indent = $self->strip_verbatim_indent;
  if ($indent and ref($indent) eq 'CODE') {
    # The callback gets to see the content lines only.
    my @name_and_attrs = splice @$para, 0, 2;
    $indent = $indent->($para);
    unshift @$para, @name_and_attrs;
  }

  my $verbatim_formatted =
    $self->{'accept_codes'} && $self->{'accept_codes'}{'VerbatimFormatted'};
  my $strip_indent = $indent && !$verbatim_formatted;

  foreach my $line (@$para[2 .. $#$para]) {   # aliases the real elements
    # Strip indentation.
    $line =~ s/^\Q$indent// if $strip_indent;

    # Sort of adapted from Text::Tabs -- yes, it's hardwired in that
    # tabs are at every EIGHTH column.  For portability, it has to be
    # one setting everywhere, and 8th wins.
    1 while $line =~
      s/^([^\t]*)(\t+)/$1.(" " x ((length($2)<<3)-(length($1)&7)))/e;

    # TODO: whinge about (or otherwise treat) unindented or overlong lines
  }

  # Now the VerbatimFormatted hoodoo...
  if ($verbatim_formatted) {
    # Kill any number of terminal blank lines
    pop @$para while @$para > 3 and $para->[-1] !~ m/\S/;
    $self->_verbatim_format($para);
    return;
  }

  if ($self->{'codes_in_verbatim'}) {
    my $text = join("\n", splice(@$para, 2));
    push @$para,
      @{ $self->_make_treelet(
        $text, $para->[1]{'start_line'}, $para->[1]{'xml:space'}
      ) };
  } elsif (@$para > 3) {
    my $text = join "\n", splice(@$para, 2);
    push @$para, $text;
  }
  $para->[-1] =~ s/\n+$//s; # Kill any number of terminal newlines

  return;
}
1580
# Give a paragraph "data" treatment: set its 'xml:space' attribute to
# 'preserve' and, if it has more than one content line, join the lines
# (with newlines) into a single string.
#
#   $para   the [name, {attrs}, lines...] paragraph, altered in place
#
# Returns nothing.
sub _ponder_Data {
  my ($self, $para) = @_;
  DEBUG and print STDERR " giving data treatment...\n";

  $para->[1]{'xml:space'} = 'preserve';

  if (@$para > 3) {
    my $joined = join "\n", splice(@$para, 2);
    push @$para, $joined;
  }

  return;
}
1588
1589
1590
1591
1592###########################################################################
1593
# Walk one treelet node, firing the parser's events for it: an element
# start, then the node's children in order, then an element end.
#
# Called as a plain function, with the node flattened into the argument
# list:
#   _traverse_treelet_bit($parser, $name, $attrs, @children)
# A child that is a reference is itself a node and is walked recursively.
# Each run of consecutive non-reference children is concatenated and
# reported as a single text event.
#
# Returns nothing.
sub _traverse_treelet_bit {  # for use only by the routine above
  my $self = shift;
  my $name = shift;

  # The handlers are given a copy of the name in a scratch variable.
  my $scratch = $name;
  $self->_handle_element_start($scratch, shift @_);   # $attrs

  while (@_) {
    my $child = shift @_;

    if (ref $child) {
      _traverse_treelet_bit($self, @$child);
      next;
    }

    # Gather up the text children that directly follow this one.
    $child .= shift @_ while @_ and !ref $_[0];
    $self->_handle_text($child);
  }

  $scratch = $name;
  $self->_handle_element_end($scratch);
  return;
}
1613
1614#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1615
# Build the paragraphs needed to close every region recorded in the
# parser's 'curr_open' stack.
#
# Each closer is a copy of the open region's record, with its own copy of
# the attribute hash (flagged 'fake-closer'), renamed '=end' (for a '=for'
# region) or '=back' (for an '=over').  A record with no content gets one
# content element: its 'target' attribute, or '' if it has none.  An
# unclosed =over is complained about as well.
#
# Returns the closers innermost region first, or an empty list if the
# parser has no 'curr_open'.  Dies on a region of any other kind.
sub _closers_for_all_curr_open {
  my $self  = $_[0];
  my $stack = $self->{'curr_open'} or return;

  my @closers;   # collected outermost first, turned around at the end
  foreach my $region (@$stack) {
    my @closer = @$region;
    $closer[1] = { %{ $closer[1] } };   # don't share attributes with the region
    #$closer[1]{'start_line'} = -1;

    my $kind = $closer[0];
    if ($kind eq '=over') {
      $self->whine(
        $region->[1]{start_line} ,
        "=over without closing =back"
      );
      $closer[0] = '=back';
    } elsif ($kind eq '=for') {
      $closer[0] = '=end';
    } else {
      die "I don't know how to auto-close an open $kind region";
    }

    if (@closer <= 2) {
      my $target = $closer[1]{'target'};
      push @closer, defined($target) ? $target : '';
       # since =over's don't have targets
    }

    $closer[1]{'fake-closer'} = 1;

    DEBUG and print STDERR "Queuing up fake-o event: ", pretty(\@closer), "\n";
    push @closers, \@closer;
  }

  @closers = reverse @closers;
  return @closers;
}
1649
1650#--------------------------------------------------------------------------
1651
1652sub _verbatim_format {
1653  my($it, $p) = @_;
1654
1655  my $formatting;
1656
1657  for(my $i = 2; $i < @$p; $i++) { # work backwards over the lines
1658    DEBUG and print STDERR "_verbatim_format appends a newline to $i: $p->[$i]\n";
1659    $p->[$i] .= "\n";
1660     # Unlike with simple Verbatim blocks, we don't end up just doing
1661     # a join("\n", ...) on the contents, so we have to append a
1662     # newline to ever line, and then nix the last one later.
1663  }
1664
1665  if( DEBUG > 4 ) {
1666    print STDERR "<<\n";
1667    for(my $i = $#$p; $i >= 2; $i--) { # work backwards over the lines
1668      print STDERR "_verbatim_format $i: $p->[$i]";
1669    }
1670    print STDERR ">>\n";
1671  }
1672
1673  for(my $i = $#$p; $i > 2; $i--) {
1674    # work backwards over the lines, except the first (#2)
1675
1676    #next unless $p->[$i]   =~ m{^#:([ \^\/\%]*)\n?$}s
1677    #        and $p->[$i-1] !~ m{^#:[ \^\/\%]*\n?$}s;
1678     # look at a formatty line preceding a nonformatty one
1679    DEBUG > 5 and print STDERR "Scrutinizing line $i: $$p[$i]\n";
1680    if($p->[$i]   =~ m{^#:([ \^\/\%]*)\n?$}s) {
1681      DEBUG > 5 and print STDERR "  It's a formatty line.  ",
1682       "Peeking at previous line ", $i-1, ": $$p[$i-1]: \n";
1683
1684      if( $p->[$i-1] =~ m{^#:[ \^\/\%]*\n?$}s ) {
1685        DEBUG > 5 and print STDERR "  Previous line is formatty!  Skipping this one.\n";
1686        next;
1687      } else {
1688        DEBUG > 5 and print STDERR "  Previous line is non-formatty!  Yay!\n";
1689      }
1690    } else {
1691      DEBUG > 5 and print STDERR "  It's not a formatty line.  Ignoring\n";
1692      next;
1693    }
1694
1695    # A formatty line has to have #: in the first two columns, and uses
1696    # "^" to mean bold, "/" to mean underline, and "%" to mean bold italic.
1697    # Example:
1698    #   What do you want?  i like pie. [or whatever]
1699    # #:^^^^^^^^^^^^^^^^^              /////////////
1700
1701
1702    DEBUG > 4 and print STDERR "_verbatim_format considers:\n<$p->[$i-1]>\n<$p->[$i]>\n";
1703
1704    $formatting = '  ' . $1;
1705    $formatting =~ s/\s+$//s; # nix trailing whitespace
1706    unless(length $formatting and $p->[$i-1] =~ m/\S/) { # no-op
1707      splice @$p,$i,1; # remove this line
1708      $i--; # don't consider next line
1709      next;
1710    }
1711
1712    if( length($formatting) >= length($p->[$i-1]) ) {
1713      $formatting = substr($formatting, 0, length($p->[$i-1]) - 1) . ' ';
1714    } else {
1715      $formatting .= ' ' x (length($p->[$i-1]) - length($formatting));
1716    }
1717    # Make $formatting and the previous line be exactly the same length,
1718    # with $formatting having a " " as the last character.
1719
1720    DEBUG > 4 and print STDERR "Formatting <$formatting>    on <", $p->[$i-1], ">\n";
1721
1722
1723    my @new_line;
1724    while( $formatting =~ m{\G(( +)|(\^+)|(\/+)|(\%+))}g ) {
1725      #print STDERR "Format matches $1\n";
1726
1727      if($2) {
1728        #print STDERR "SKIPPING <$2>\n";
1729        push @new_line,
1730          substr($p->[$i-1], pos($formatting)-length($1), length($1));
1731      } else {
1732        #print STDERR "SNARING $+\n";
1733        push @new_line, [
1734          (
1735            $3 ? 'VerbatimB'  :
1736            $4 ? 'VerbatimI'  :
1737            $5 ? 'VerbatimBI' : die("Should never get called")
1738          ), {},
1739          substr($p->[$i-1], pos($formatting)-length($1), length($1))
1740        ];
1741        #print STDERR "Formatting <$new_line[-1][-1]> as $new_line[-1][0]\n";
1742      }
1743    }
1744    my @nixed =
1745      splice @$p, $i-1, 2, @new_line; # replace myself and the next line
1746    DEBUG > 10 and print STDERR "Nixed count: ", scalar(@nixed), "\n";
1747
1748    DEBUG > 6 and print STDERR "New version of the above line is these tokens (",
1749      scalar(@new_line), "):",
1750      map( ref($_)?"<@$_> ":"<$_>", @new_line ), "\n";
1751    $i--; # So the next line we scrutinize is the line before the one
1752          #  that we just went and formatted
1753  }
1754
1755  $p->[0] = 'VerbatimFormatted';
1756
1757  # Collapse adjacent text nodes, just for kicks.
1758  for( my $i = 2; $i > $#$p; $i++ ) { # work forwards over the tokens except for the last
1759    if( !ref($p->[$i]) and !ref($p->[$i + 1]) ) {
1760      DEBUG > 5 and print STDERR "_verbatim_format merges {$p->[$i]} and {$p->[$i+1]}\n";
1761      $p->[$i] .= splice @$p, $i+1, 1; # merge
1762      --$i;  # and back up
1763    }
1764  }
1765
1766  # Now look for the last text token, and remove the terminal newline
1767  for( my $i = $#$p; $i >= 2; $i-- ) {
1768    # work backwards over the tokens, even the first
1769    if( !ref($p->[$i]) ) {
1770      if($p->[$i] =~ s/\n$//s) {
1771        DEBUG > 5 and print STDERR "_verbatim_format killed the terminal newline on #$i: {$p->[$i]}, after {$p->[$i-1]}\n";
1772      } else {
1773        DEBUG > 5 and print STDERR
1774         "No terminal newline on #$i: {$p->[$i]}, after {$p->[$i-1]} !?\n";
1775      }
1776      last; # we only want the next one
1777    }
1778  }
1779
1780  return;
1781}
1782
1783
1784#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1785
1786
sub _treelet_from_formatting_codes {
  # Given a paragraph, returns a treelet.  Full of scary tokenizing code.
  #  Like [ '~Top', {'start_line' => $start_line},
  #            "I like ",
  #            [ 'B', {}, "pie" ],
  #            "!"
  #       ]
  #
  # Arguments:
  #   $self           - the parser; its 'preserve_whitespace' attribute is
  #                     consulted, and its whine() method is called if any
  #                     formatting code is left unterminated.
  #   $para           - the paragraph text to tokenize.
  #   $start_line     - stored as 'start_line' on the '~Top' node and passed
  #                     to whine().
  #   $preserve_space - if true, whitespace in $para is left untouched.
  #
  # Returns the '~Top' treelet (an array reference).

  my($self, $para, $start_line, $preserve_space) = @_;

  my $treelet = ['~Top', {'start_line' => $start_line},];

  unless ($preserve_space || $self->{'preserve_whitespace'}) {
    $para =~ s/\s+/ /g; # collapse and trim all whitespace first.
    $para =~ s/ $//;
    $para =~ s/^ //;
  }

  # The only apparent problem with the above code is that N<<  >> turns
  # into N<< >>.  But then, word wrapping does that too!  So don't do that!

  # @stack holds one entry per currently open formatting code: 0 for a
  # simple code (closed by a single ">"), otherwise the number of ">"
  # characters needed to close it.
  my @stack;
  # @lineage is the chain of nodes currently being filled, outermost first;
  # $lineage[-1] is where new children get pushed.
  my @lineage = ($treelet);
  my $raw = ''; # raw content of L<> fcode before splitting/processing
    # XXX 'raw' is not 100% accurate: all surrounding whitespace is condensed
    # into just 1 ' '. Is this the regex's doing or 'raw's?
  my $inL = 0;  # true while an L<> code is open and $raw is accumulating

  DEBUG > 4 and print STDERR "Paragraph:\n$para\n\n";

  # Here begins our frightening tokenizer RE.  The following regex matches
  # text in four main parts:
  #
  #  * Start-codes.  The first alternative matches C< or C<<, the latter
  #    followed by some whitespace.  $1 will hold the entire start code
  #    (including any space following a multiple-angle-bracket delimiter),
  #    and $2 will hold only the additional brackets past the first in a
  #    multiple-bracket delimiter.  length($2) + 1 will be the number of
  #    closing brackets we have to find.
  #
  #  * Closing brackets.  Match some amount of whitespace followed by
  #    multiple close brackets.  The logic to see if this closes anything
  #    is down below.  Note that in order to parse C<<  >> correctly, we
  #    have to use look-behind (?<=\s\s), since the match of the starting
  #    code will have consumed the whitespace.
  #
  #  * A single closing bracket, to close a simple code like C<>.
  #
  #  * Something that isn't a start or end code.  We have to be careful
  #    about accepting whitespace, since perlpodspec says that any whitespace
  #    before a multiple-bracket closing delimiter should be ignored.
  #
  while($para =~
    m/\G
      (?:
        # Match starting codes, including the whitespace following a
        # multiple-delimiter start code.  $1 gets the whole start code and
        # $2 gets all but one of the <s in the multiple-bracket case.
        ([A-Z]<(?:(<+)\s+)?)
        |
        # Match multiple-bracket end codes.  $3 gets the whitespace that
        # should be discarded before an end bracket but kept in other cases
        # and $4 gets the end brackets themselves.
        (\s+|(?<=\s\s))(>{2,})
        |
        (\s?>)          # $5: simple end-codes
        |
        (               # $6: stuff containing no start-codes or end-codes
          (?:
            [^A-Z\s>]
            |
            (?:
              [A-Z](?!<)
            )
            |
            # whitespace is ok, but we don't want to eat the whitespace before
            # a multiple-bracket end code.
            # NOTE: we may still have problems with e.g. S<<    >>
            (?:
              \s(?!\s*>{2,})
            )
          )+
        )
      )
    /xgo
  ) {
    DEBUG > 4 and print STDERR "\nParagraphic tokenstack = (@stack)\n";
    if(defined $1) {
      # A start code: remember how it must be closed, then open a new node.
      if(defined $2) {
        DEBUG > 3 and print STDERR "Found complex start-text code \"$1\"\n";
        push @stack, length($2) + 1;
          # length of the necessary complex end-code string
      } else {
        DEBUG > 3 and print STDERR "Found simple start-text code \"$1\"\n";
        push @stack, 0;  # signal that we're looking for simple
      }
      # The node is named after the code's single capital letter, and is
      # attached as the newest child of the node that was open until now.
      push @lineage, [ substr($1,0,1), {}, ];  # new node object
      push @{ $lineage[-2] }, $lineage[-1];
      if ('L' eq substr($1,0,1)) {
        # An L< seen while not already inside one starts a fresh $raw; an
        # L< nested inside another L<> is just appended to it.
        $raw = $inL ? $raw.$1 : ''; # reset raw content accumulator
        $inL = 1;
      } else {
        $raw .= $1 if $inL;
      }

    } elsif(defined $4) {
      DEBUG > 3 and print STDERR "Found apparent complex end-text code \"$3$4\"\n";
      # This is where it gets messy...
      # The two branches ending in "next" treat the match as plain text and
      # skip the node-closing code below; the other three fall through to it.
      if(! @stack) {
        # We saw " >>>>" but needed nothing.  This is ALL just stuff then.
        DEBUG > 4 and print STDERR " But it's really just stuff.\n";
        push @{ $lineage[-1] }, $3, $4;
        next;
      } elsif(!$stack[-1]) {
        # We saw " >>>>" but needed only ">".  Back pos up.
        DEBUG > 4 and print STDERR " And that's more than we needed to close simple.\n";
        push @{ $lineage[-1] }, $3; # That was a for-real space, too.
        pos($para) = pos($para) - length($4) + 1;
      } elsif($stack[-1] == length($4)) {
        # We found " >>>>", and it was exactly what we needed.  Commonest case.
        DEBUG > 4 and print STDERR " And that's exactly what we needed to close complex.\n";
      } elsif($stack[-1] < length($4)) {
        # We saw " >>>>" but needed only " >>".  Back pos up.
        DEBUG > 4 and print STDERR " And that's more than we needed to close complex.\n";
        pos($para) = pos($para) - length($4) + $stack[-1];
      } else {
        # We saw " >>>>" but needed " >>>>>>".  So this is all just stuff!
        DEBUG > 4 and print STDERR " But it's really just stuff, because we needed more.\n";
        push @{ $lineage[-1] }, $3, $4;
        next;
      }
      #print STDERR "\nHOOBOY ", scalar(@{$lineage[-1]}), "!!!\n";

      push @{ $lineage[-1] }, '' if 2 == @{ $lineage[-1] };
      # Keep the element from being childless

      # Close the innermost open code.
      pop @stack;
      pop @lineage;

      unless (@stack) { # not in an L if there are no open fcodes
        $inL = 0;
        # If the child just completed on the now-current node is an L,
        # record the accumulated raw text in its attribute hash.
        if (ref $lineage[-1][-1] && $lineage[-1][-1][0] eq 'L') {
          $lineage[-1][-1][1]{'raw'} = $raw
        }
      }
      # Still inside an L<>: the end code just consumed is part of its raw text.
      $raw .= $3.$4 if $inL;

    } elsif(defined $5) {
      DEBUG > 3 and print STDERR "Found apparent simple end-text code \"$5\"\n";

      if(@stack and ! $stack[-1]) {
        # We're indeed expecting a simple end-code
        DEBUG > 4 and print STDERR " It's indeed an end-code.\n";

        if(length($5) == 2) { # There was a space there: " >"
          push @{ $lineage[-1] }, ' ';
        } elsif( 2 == @{ $lineage[-1] } ) { # Closing a childless element
          push @{ $lineage[-1] }, ''; # keep it from being really childless
        }

        pop @stack;
        pop @lineage;
      } else {
        # Nothing open, or the open code needs a multi-bracket closer:
        # the ">" (with its optional leading space) is ordinary text.
        DEBUG > 4 and print STDERR " It's just stuff.\n";
        push @{ $lineage[-1] }, $5;
      }

      unless (@stack) { # not in an L if there are no open fcodes
        $inL = 0;
        if (ref $lineage[-1][-1] && $lineage[-1][-1][0] eq 'L') {
          $lineage[-1][-1][1]{'raw'} = $raw
        }
      }
      $raw .= $5 if $inL;

    } elsif(defined $6) {
      # Plain text: it becomes a child of whichever node is currently open.
      DEBUG > 3 and print STDERR "Found stuff \"$6\"\n";
      push @{ $lineage[-1] }, $6;
      $raw .= $6 if $inL;
        # XXX does not capture multiplace whitespaces -- 'raw' ends up with
        #     at most 1 leading/trailing whitespace, why not all of it?

    } else {
      # should never ever ever ever happen
      DEBUG and print STDERR "AYYAYAAAAA at line ", __LINE__, "\n";
      die "SPORK 512512!";
    }
  }

  if(@stack) { # Uhoh, some sequences weren't closed.
    # Build a picture of the unclosed codes for the warning text, starting
    # from "..." and wrapping it in each still-open code, innermost first.
    my $x= "...";
    while(@stack) {
      push @{ $lineage[-1] }, '' if 2 == @{ $lineage[-1] };
      # Hmmmmm!

      my $code         = (pop @lineage)->[0];
      my $ender_length =  pop @stack;
      if($ender_length) {
        --$ender_length;
        $x = $code . ("<" x $ender_length) . " $x " . (">" x $ender_length);
      } else {
        $x = $code . "<$x>";
      }
    }
    DEBUG > 1 and print STDERR "Unterminated $x sequence\n";
    $self->whine($start_line,
      "Unterminated $x sequence",
    );
  }

  return $treelet;
}
1999
2000#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2001
sub text_content_of_treelet {  # method: $parser->text_content_of_treelet($lol)
  # Method-call wrapper around stringify_lol(): the invocant is ignored and
  # only the treelet argument is handed on.
  my(undef, $lol) = @_;
  return stringify_lol($lol);
}
2005
sub stringify_lol {  # function: stringify_lol($lol)
  # Returns the concatenated text that _stringify_lol() collects from the
  # given list-of-lists.
  my($lol) = @_;
  my $collected = '';
  _stringify_lol($lol, \$collected);
  return $collected;
}
2011
sub _stringify_lol {  # the real recursor
  # Appends to the string referenced by $to every child of $lol (that is,
  # every element from index 2 onward), descending into children that are
  # themselves array references.  Returns nothing.
  my($lol, $to) = @_;

  my $element_count = @$lol;
  foreach my $idx (2 .. $element_count - 1) {
    my $child = $lol->[$idx];
    my $is_subtree = ( ref($child || '') and UNIVERSAL::isa($child, 'ARRAY') );

    unless( $is_subtree ) {
      $$to .= $child;       # plain text: append as-is
      next;
    }
    _stringify_lol($child, $to);  # nested node: recurse
  }
  return;
}
2023
2024#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2025
sub _dump_curr_open { # return a string representation of the stack
  # Reads the 'curr_open' array from the object in $_[0].  Each entry is
  # shown by its name, except '=for' entries, which are shown as their
  # '~really' attribute (or '=over' when that is false) followed by a space
  # and their 'target' attribute.  Entries are joined with '; '.
  my $open_stack = $_[0]{'curr_open'};

  return '[empty]' unless @$open_stack;

  my @labels;
  foreach my $entry (@$open_stack) {
    if( $entry->[0] eq '=for' ) {
      my $directive = $entry->[1]{'~really'} || '=over';
      push @labels, $directive . ' ' . $entry->[1]{'target'};
    } else {
      push @labels, $entry->[0];
    }
  }
  return join '; ', @labels;
}
2040
2041###########################################################################
# Readable backslash spellings used by pretty() for the characters it
# escapes; characters without an entry here are emitted as \x{...}.
my %pretty_form = (
  "\a" => '\a', # ding!
  "\b" => '\b', # BS
  "\e" => '\e', # ESC
  "\f" => '\f', # FF
  "\t" => '\t', # tab
  "\cm" => '\cm',
  "\cj" => '\cj',
  "\n" => '\n', # probably overrides one of either \cm or \cj
  '"' => '\"',
  '\\' => '\\\\',
  '$' => '\\$',
  '@' => '\\@',
  '%' => '\\%',
  '#' => '\\#',
);

sub pretty { # adopted from Class::Classless
  # Renders each argument as a Perl-ish literal and returns them joined
  # with ", ".  Array, scalar and hash references are rendered recursively,
  # so don't give it a cyclic data structure!
  my @rendered;

  foreach my $item (@_) {
    unless( defined $item ) {
      push @rendered, "undef";
      next;
    }

    my $kind = ref $item;

    if( $kind eq 'ARRAY' or $kind eq 'Pod::Simple::LinkSection' ) {
      push @rendered, "[ " . pretty(@$item) . " ]";
    }
    elsif( $kind eq 'SCALAR' ) {
      push @rendered, "\\" . pretty($$item);
    }
    elsif( $kind eq 'HASH' ) {
      # Keys are sorted so the output is stable.
      my @pairs = map { pretty($_) . '=>' . pretty($item->{$_}) }
                  sort keys %$item;
      push @rendered, "{" . join(", ", @pairs) . "}";
    }
    elsif( !length($item) ) {
      push @rendered, q{''}; # empty string
    }
    elsif(
      $item eq '0' # very common case
      or(
         $item =~ m/^-?(?:[123456789]\d*|0)(?:\.\d+)?$/s
         and $item ne '-0' # the strange case that RE lets thru
      )
    ) {
      push @rendered, $item; # plain numbers go out unquoted
    }
    else {
      # Work on a private copy so the caller's value is never modified.
      my $text = $item;

      # Every character that may pass through unescaped is named
      # explicitly.  There are shortcuts one could take (ranges such as
      # [A-Z]), but the original author (Karl Williamson) did not trust
      # every Perl release to handle them without bugs, EBCDIC platforms
      # included, so listing the characters seemed safest.
      $text =~
        s<([^ !#'()*+,\-./0123456789:;\<=\>?ABCDEFGHIJKLMNOPQRSTUVWXYZ\[\]^_`abcdefghijklmnopqrstuvwxyz{|}~])>
         <$pretty_form{$1} || '\\x{'.sprintf("%x", ord($1)).'}'>eg;
      push @rendered, qq{"$text"};
    }
  }

  return join ", ", @rendered;
}
2107
2108#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2109
2110# A rather unsubtle method of blowing away all the state information
2111# from a parser object so it can be reused. Provided as a utility for
2112# backward compatibility in Pod::Man, etc. but not recommended for
2113# general use.
2114
sub reinit {
  # Deletes the listed state keys from the parser object so that it can be
  # used again.
  my($self) = @_;

  my @state_keys = qw(
    source_dead source_filename doc_has_started
    start_of_pod_block content_seen last_was_blank paras curr_open
    line_count pod_para_count in_pod ~tried_gen_errata all_errata errata
    errors_seen Title
  );

  for my $key (@state_keys) {
    delete $self->{$key};
  }
}
2125
2126#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
21271;
2128
2129