1#=======================================================================
2#
3#   THIS IS A REUSED PERL MODULE, FOR PROPER LICENCING TERMS SEE BELOW:
4#
5#   Copyright Martin Hosken <Martin_Hosken@sil.org>
6#
7#   Modified for PDF::API2 by Alfred Reibenschuh <alfredreibenschuh@gmx.net>
8#
9#   No warranty or expression of effectiveness, least of all regarding
10#   anyone's safety, is implied in this software or documentation.
11#
12#   This specific module is licensed under the Perl Artistic License.
13#   Effective 28 January 2021, the original author and copyright holder,
14#   Martin Hosken, has given permission to use and redistribute this module
15#   under the MIT license.
16#
17#=======================================================================
18package PDF::Builder::Basic::PDF::File;
19
20use strict;
21use warnings;
22
23our $VERSION = '3.023'; # VERSION
24our $LAST_UPDATE = '3.023'; # manually update whenever code is changed
25
26=head1 NAME
27
28PDF::Builder::Basic::PDF::File - Holds the trailers and cross-reference tables for a PDF file
29
30=head1 SYNOPSIS
31
32 $p = PDF::Builder::Basic::PDF::File->open("filename.pdf", 1);
33 $p->new_obj($obj_ref);
34 $p->free_obj($obj_ref);
35 $p->append_file();
36 $p->close_file();
37 $p->release();       # IMPORTANT!
38
39=head1 DESCRIPTION
40
41This class keeps track of the directory aspects of a PDF file. There are two
42parts to the directory: the main directory object, which is the parent to all
43other objects, and a chain of cross-reference tables and corresponding trailer
44dictionaries, starting with the main directory object.
45
46=head1 INSTANCE VARIABLES
47
48Within this class hierarchy, rather than making everything visible via methods,
49which would be a lot of work, there are various instance variables which are
50accessible via associative array referencing. To distinguish instance variables
51from content variables (which may come from the PDF content itself), each such
52variable will start with a space.
53
54Variables which do not start with a space directly reflect elements in a PDF
55dictionary. In the case of a C<PDF::Builder::Basic::PDF::File>, the elements
56reflect those in the trailer dictionary.
57
58Since some variables are not designed for class users to access, variables are
59marked in the documentation with B<(R)> to indicate that such an entry should
60only be used as B<read-only> information. B<(P)> indicates that the information
61is B<private>, and not designed for user use at all, but is included in the
62documentation for completeness and to ensure that nobody else tries to use it.
63
64=over
65
66=item newroot
67
68This variable allows the user to create a new root entry to occur in the trailer
69dictionary which is output when the file is written or appended. If you wish to
70override the root element in the dictionary you have, use this entry to indicate
71that without losing the current Root entry. Notice that newroot should point to
72a PDF level object and not just to a dictionary which does not have object
73status.
74
75=item INFILE (R)
76
77Contains the filehandle used to read this information into this PDF directory.
78It is an IO object.
79
80=item fname (R)
81
82This is the filename which is reflected by INFILE, or the original IO object
83passed in.
84
85=item update (R)
86
87This indicates that the read file has been opened for update and that at some
88point, C<< $p->appendfile() >> can be called to update the file with the
89changes that have been made to the memory representation.
90
91=item maxobj (R)
92
93Contains the first usable object number above any that have already appeared
94in the file so far.
95
96=item outlist (P)
97
98This is a list of Objind which are to be output when the next C<appendfile()>
99or C<outfile()> occurs.
100
101=item firstfree (P)
102
103Contains the first free object in the free object list. Free objects are removed
104from the front of the list and added to the end.
105
106=item lastfree (P)
107
108Contains the last free object in the free list. It may be the same as the
109C<firstfree> if there is only one free object.
110
111=item objcache (P)
112
113All objects are held in the cache to ensure that a system only has one
114occurrence of each object. In effect, the objind class acts as a container type
115class to hold the PDF object structure, and it would be unfortunate if there
116were two identical place-holders floating around a system.
117
118=item epos (P)
119
120The end location of the read-file.
121
122=back
123
124Each trailer dictionary contains a number of private instance variables which
125hold the chain together.
126
127=over
128
129=item loc (P)
130
131Contains the location of the start of the cross-reference table preceding the
132trailer.
133
134=item xref (P)
135
136Contains an anonymous array of each cross-reference table entry.
137
138=item prev (P)
139
140A reference to the previous table. Note this differs from the Prev entry which
141is in PDF, which contains the location of the previous cross-reference table.
142
143=back
144
145=head1 METHODS
146
147=cut
148
149use Scalar::Util qw(blessed weaken);
150
151use vars qw($cr $irreg_char $reg_char $ws_char $delim_char %types);
152
153$ws_char    = '[ \t\r\n\f\0]';
154$delim_char = '[][<>{}()/%]';
155$reg_char   = '[^][<>{}()/% \t\r\n\f\0]';
156$irreg_char = '[][<>{}()/% \t\r\n\f\0]';
157# \015 = x0D = CR or \r, \012 = x0A = LF or \n
158# TBD a line-end character is space CR ' \r', space LF ' \n', or CR LF '\r\n'
159#     have seen working PDFs with just a CR and space CR
160$cr         = '\s*(?:\015|\012|(?:\015\012))';
161
162my $re_comment = qr/(?:\%[^\r\n]*)/;
163my $re_whitespace = qr/(?:[ \t\r\n\f\0]|$re_comment)/;
164
165%types = (
166    'Page'  => 'PDF::Builder::Basic::PDF::Page',
167    'Pages' => 'PDF::Builder::Basic::PDF::Pages',
168);
169
170my $readDebug = 0;
171
172use Carp;
173use IO::File;
174
175# Now for the basic PDF types
176use PDF::Builder::Basic::PDF::Utils;
177
178use PDF::Builder::Basic::PDF::Array;
179use PDF::Builder::Basic::PDF::Bool;
180use PDF::Builder::Basic::PDF::Dict;
181use PDF::Builder::Basic::PDF::Name;
182use PDF::Builder::Basic::PDF::Number;
183use PDF::Builder::Basic::PDF::Objind;
184use PDF::Builder::Basic::PDF::String;
185use PDF::Builder::Basic::PDF::Page;
186use PDF::Builder::Basic::PDF::Pages;
187use PDF::Builder::Basic::PDF::Null;
188use POSIX qw(ceil floor);
189
190=head2 PDF::Builder::Basic::PDF::File->new()
191
192Creates a new, empty file object which can act as the host to other PDF objects.
193Since there is no file associated with this object, it is assumed that the
194object is created in readiness for creating a new PDF file.
195
196=cut
197
198sub new {
199    my ($class, $root) = @_;
200    my $self = $class->_new();
201
202    unless ($root) {
203        $root = PDFDict();
204        $root->{'Type'} = PDFName('Catalog');
205    }
206    $self->new_obj($root);
207    $self->{'Root'} = $root;
208
209    return $self;
210}
211
212=head2 $p = PDF::Builder::Basic::PDF::File->open($filename, $update, %options)
213
214Opens the file and reads all the trailers and cross reference tables to build
215a complete directory of objects.
216
217C<$filename> may be a string or an IO object.
218
219C<$update> specifies whether this file is being opened for updating and editing
220(I<TRUE> value), or simply to be read (I<FALSE> or undefined value).
221
222C<%options> may include
223
224=over
225
226=item -diags => 1
227
228If C<-diags> is set to 1, various warning messages will be given if a
229suspicious PDF structure is found, and some fixup may be attempted. There is
230no guarantee that any fixup will change the PDF to legitimate, or that there
231won't be other problems found further down the line. If this flag is I<not>
232given, and a structural problem is found, it is fairly likely that errors (and
233even a program B<crash>) may happen further along. If you experience crashes
234when reading in a PDF file, try running with C<-diags> and see what is reported.
235
236There are many PDF files out "in the wild" which, while failing to conform to
237Adobe's standards, appear to be tolerated by PDF Readers. Thus, Builder will no
238longer fail on them, but merely comment on their existence.
239
240=back
241
242=cut
243
244sub open {
245    my ($class, $filename, $update, %options) = @_;
246    my ($fh, $buffer);
247    $options{'-diags'} = 0 if not defined $options{'-diags'}; # default
248
249    my $comment = ''; # any comment jammed into the PDF header
250    my $self = $class->_new();
251    if (ref $filename) {
252        $self->{' INFILE'} = $filename;
253        if ($update) {
254            $self->{' update'} = 1;
255            $self->{' OUTFILE'} = $filename;
256        }
257        $fh = $filename;
258    } else {
259        die "File '$filename' does not exist!" unless -f $filename;
260        $fh = IO::File->new(($update ? '+' : '') . "<$filename") || return;
261        $self->{' INFILE'} = $fh;
262        if ($update) {
263            $self->{' update'} = 1;
264            $self->{' OUTFILE'} = $fh;
265            $self->{' fname'} = $filename;
266        }
267    }
268    binmode $fh, ':raw';
269    $fh->seek(0, 0);            # go to start of file
270    $fh->read($buffer, 255);
271    unless ($buffer =~ m/^\%PDF\-(\d+\.\d+)(.*?)$cr/mo) {
272        die "$filename does not contain a PDF version";
273    }
274    $self->{' version'} = $1;
275    # can't run verCheckInput() yet, as full ' version' not set
276    if (defined $2 && length($2) > 0) {
277       $comment = $2;  # save for output as comment
278       # since we just echo the original header + comment, unless that causes
279       # problems in some Readers, we can just leave it be (no call to strip
280       # out inline comment and create a separate comment further along).
281    }
282
283    $fh->seek(0, 2);            # go to end of file
284    my $end = $fh->tell();
285    $self->{' epos'} = $end;
286    foreach my $offset (1 .. 64) {
287    	$fh->seek($end - 16 * $offset, 0);
288    	$fh->read($buffer, 16 * $offset);
289    	last if $buffer =~ m/startxref($cr|\s*)\d+($cr|\s*)\%\%eof.*?/i;
290    }
291    unless ($buffer =~ m/startxref[^\d]+([0-9]+)($cr|\s*)\%\%eof.*?/i) {
292	if ($options{'-diags'} == 1) {
293            warn "Malformed PDF file $filename"; #orig 'die'
294        }
295    }
296    my $xpos = $1;
297    $self->{' xref_position'} = $xpos;
298
299    my $tdict = $self->readxrtr($xpos, %options);
300    foreach my $key (keys %$tdict) {
301        $self->{$key} = $tdict->{$key};
302    }
303
304    return $self;
305} # end of open()
306
307=head2 $p->release()
308
309Releases ALL of the memory used by the PDF document and all of its
310component objects.  After calling this method, do B<NOT> expect to
311have anything left in the C<PDF::Builder::Basic::PDF::File> object
312(so if you need to save, be sure to do it before calling this method).
313
314B<NOTE>, that it is important that you call this method on any
315C<PDF::Builder::Basic::PDF::File> object when you wish to destroy it and
316free up its memory.  Internally, PDF files have an enormous number of
317cross-references, and this causes circular references within the
318internal data structures. Calling C<release()> causes a brute-force
319cleanup of the data structures, freeing up all of the memory. Once
320you've called this method, though, don't expect to be able to do
321anything else with the C<PDF::Builder::Basic::PDF::File> object; it'll
322have B<no> internal state whatsoever.
323
324=cut
325
326# Maintainer's Question: Couldn't this be handled by a DESTROY method
327# instead of requiring an explicit call to release()?
328sub release {
329    my $self = shift();
330
331    return $self unless ref($self);
332    my @tofree = values %$self;
333
334    foreach my $key (keys %$self) {
335        $self->{$key} = undef;
336        delete $self->{$key};
337    }
338
339    while (my $item = shift @tofree) {
340        if      (blessed($item) and $item->can('release')) {
341            $item->release(1);
342        } elsif (ref($item) eq 'ARRAY') {
343            push @tofree, @$item;
344        } elsif (ref($item) eq 'HASH') {
345            push @tofree, values %$item;
346            foreach my $key (keys %$item) {
347                $item->{$key} = undef;
348                delete $item->{$key};
349            }
350        } else {
351            $item = undef;
352        }
353    }
354
355    return;
356} # end of release()
357
358=head2 $p->append_file()
359
360Appends the objects for output to the read file and then appends the
361appropriate table.
362
363=cut
364
365sub append_file {
366    my $self = shift();
367    return unless $self->{' update'};
368
369    my $fh = $self->{' INFILE'};
370
371    # hack to upgrade pdf-version number to support requested features in
372    # higher versions than the pdf was originally created. WARNING: new version
373    # must be exactly SAME length as the old (e.g., 1.6 replacing 1.4), or
374    # problems are likely with overwriting header. perhaps some day we will
375    # need to check the old version being ovewritten, and adjust something to
376    # avoid corrupting the file.
377    my $version = $self->{' version'} || 1.4;
378    $fh->seek(0, 0);
379    # assume that any existing EOL after version will be reused
380    $fh->print("%PDF-$version");
381
382    my $tdict = PDFDict();
383    $tdict->{'Prev'} = PDFNum($self->{' loc'});
384    $tdict->{'Info'} = $self->{'Info'};
385    if (defined $self->{' newroot'}) {
386        $tdict->{'Root'} = $self->{' newroot'};
387    } else {
388        $tdict->{'Root'} = $self->{'Root'};
389    }
390    $tdict->{'Size'} = $self->{'Size'};
391
392    foreach my $key (grep { $_ !~ m/^\s/ } keys %$self) {
393        $tdict->{$key} = $self->{$key} unless defined $tdict->{$key};
394    }
395
396    $fh->seek($self->{' epos'}, 0);
397    $self->out_trailer($tdict, $self->{' update'});
398    close $self->{' OUTFILE'};
399
400    return;
401} # end of append_file()
402
403=head2 $p->out_file($fname)
404
405Writes a PDF file to a file of the given filename, based on the current list of
406objects to be output. It creates the trailer dictionary based on information
407in C<$self>.
408
409$fname may be a string or an IO object.
410
411=cut
412
413sub out_file {
414    my ($self, $fname) = @_;
415
416    $self = $self->create_file($fname);
417    $self = $self->close_file();
418
419    return $self;
420}
421
422=head2 $p->create_file($fname)
423
424Creates a new output file (no check is made of an existing open file) of
425the given filename or IO object. Note: make sure that C<< $p->{' version'} >>
426is set correctly before calling this function.
427
428=cut
429
430sub create_file {
431    my ($self, $filename) = @_;
432    my $fh;
433
434    $self->{' fname'} = $filename;
435    if (ref $filename) {
436        $fh = $filename;
437    } else {
438        $fh = IO::File->new(">$filename") || die "Unable to open $filename for writing";
439        binmode($fh,':raw');
440    }
441
442    $self->{' OUTFILE'} = $fh;
443    $fh->print('%PDF-' . ($self->{' version'} || '1.4') . "\n");
444    $fh->print("%\xC6\xCD\xCD\xB5\n");   # and some binary stuff in a comment.
445
446    # PDF spec requires 4 or more "binary" bytes (128 or higher value) in a
447    # comment immediately following the PDF-x.y header, to alert reader that
448    # there is binary data. Actual values are apparently arbitrary. This DOES
449    # mean that other comments can NOT be inserted between the header and the
450    # binary comment! PDF::Builder always outputs this comment, so is always
451    # claiming binary data (no harm done?).
452
453    return $self;
454}
455
456=head2 $p->close_file()
457
458Closes up the open file for output, by outputting the trailer, etc.
459
460=cut
461
462sub close_file {
463    my $self = shift();
464
465    my $tdict = PDFDict();
466    $tdict->{'Info'} = $self->{'Info'} if defined $self->{'Info'};
467    $tdict->{'Root'} = (defined $self->{' newroot'} and $self->{' newroot'} ne '') ? $self->{' newroot'} : $self->{'Root'};
468
469    # remove all freed objects from the outlist, AND the outlist_cache if not updating
470    # NO! Don't do that thing! In fact, let out_trailer do the opposite!
471
472    $tdict->{'Size'} = $self->{'Size'} || PDFNum(1);
473    $tdict->{'Prev'} = PDFNum($self->{' loc'}) if $self->{' loc'};
474    if ($self->{' update'}) {
475        foreach my $key (grep { $_ !~ m/^[\s\-]/ } keys %$self) {
476            $tdict->{$key} = $self->{$key} unless defined $tdict->{$key};
477        }
478
479        my $fh = $self->{' INFILE'};
480        $fh->seek($self->{' epos'}, 0);
481    }
482
483    $self->out_trailer($tdict, $self->{' update'});
484    close($self->{' OUTFILE'});
485    if ($^O eq 'MacOS' and not ref($self->{' fname'})) {
486        MacPerl::SetFileInfo('CARO', 'TEXT', $self->{' fname'});
487    }
488
489    return $self;
490} # end of close_file()
491
492=head2 ($value, $str) = $p->readval($str, %opts)
493
494Reads a PDF value from the current position in the file. If C<$str> is too
495short, read some more from the current location in the file until the whole
496object is read. This is a recursive call which may slurp in a whole big stream
497(unprocessed).
498
499Returns the recursive data structure read and also the current C<$str> that has
500been read from the file.
501
502=cut
503
504sub readval {
505    my ($self, $str, %opts) = @_;
506    my $fh = $self->{' INFILE'};
507    my ($result, $value);
508
509    my $update = defined($opts{'update'}) ? $opts{'update'} : 1;
510    $str = update($fh, $str) if $update;
511
512    $str =~ s/^$ws_char+//;               # Ignore initial white space
513    $str =~ s/^\%[^\015\012]*$ws_char+//; # Ignore comments
514
515    if      ($str =~ m/^<</s) {
516        # Dictionary
517        $str = substr ($str, 2);
518        $str = update($fh, $str) if $update;
519        $result = PDFDict();
520
521        while ($str !~ m/^>>/) {
522            $str =~ s/^$ws_char+//;               # Ignore initial white space
523            $str =~ s/^\%[^\015\012]*$ws_char+//; # Ignore comments
524
525            if      ($str =~ s|^/($reg_char+)||) {
526                my $key = PDF::Builder::Basic::PDF::Name::name_to_string($1, $self);
527                ($value, $str) = $self->readval($str, %opts);
528		# per Vadim Repin (RT 131147) CHG 1. His conclusion is that
529		# it is highly unlikely, but remotely possible, that there
530		# could be legitimate use of Null objects that should NOT be
531		# prevented from bubbling up. If such a case is discovered, we
532		# might have to try Klaus Ethgen's more limited (in scope)
533		# patch in ./Pages.pm. See full discussion in RT 131147 for
534		# details on what's going on and how this fixes it.
535               #$result->{$key} = $value;  # original code
536                $result->{$key} = $value
537		   unless ref($value) eq 'PDF::Builder::Basic::PDF::Null';
538            } elsif ($str =~ s|^/$ws_char+||) {
539                # fixes a broken key problem of acrobat. -- fredo
540                ($value, $str) = $self->readval($str, %opts);
541                $result->{'null'} = $value;
542            } elsif ($str =~ s|^//|/|) {
543                # fixes again a broken key problem of illustrator/enfocus. -- fredo
544                ($value, $str) = $self->readval($str, %opts);
545                $result->{'null'} = $value;
546            } else {
547                die "Invalid dictionary key";
548            }
549            $str = update($fh, $str) if $update; # thanks gareth.jones@stud.man.ac.uk
550        }
551
552        $str =~ s/^>>//;
553        $str = update($fh, $str) if $update;
554        # streams can't be followed by a lone carriage-return.
555        # fredo: yes they can !!! -- use the MacOS, Luke.
556	    # TBD isn't this covered by $cr as space CR?
557        if (($str =~ s/^stream(?:(?:\015\012)|\012|\015)//) and ($result->{'Length'}->val() != 0)) {   # stream
558            my $length = $result->{'Length'}->val();
559            $result->{' streamsrc'} = $fh;
560            $result->{' streamloc'} = $fh->tell() - length($str);
561
562            unless ($opts{'nostreams'}) {
563                if ($length > length($str)) {
564                    $value = $str;
565                    $length -= length($str);
566                    read $fh, $str, $length + 11; # slurp the whole stream!
567                } else {
568                    $value = '';
569                }
570                $value .= substr($str, 0, $length);
571                $result->{' stream'} = $value;
572                $result->{' nofilt'} = 1;
573                $str = update($fh, $str, 1) if $update;  # tell update we are in-stream and only need an endstream
574                $str = substr($str, index($str, 'endstream') + 9);
575            }
576        }
577
578        if (defined $result->{'Type'} and defined $types{$result->{'Type'}->val()}) {
579            bless $result, $types{$result->{'Type'}->val()};
580        }
581        # gdj: FIXME: if any of the ws chars were crs, then the whole
582        # string might not have been read.
583
584    } elsif ($str =~ m/^([0-9]+)(?:$ws_char|$re_comment)+([0-9]+)(?:$ws_char|$re_comment)+R/s) {
585        # Indirect Object
586        my $num = $1;
587        $value = $2;
588        $str =~ s/^([0-9]+)(?:$ws_char|$re_comment)+([0-9]+)(?:$ws_char|$re_comment)+R//s;
589        unless ($result = $self->test_obj($num, $value)) {
590            $result = PDF::Builder::Basic::PDF::Objind->new();
591            $result->{' objnum'} = $num;
592            $result->{' objgen'} = $value;
593            $self->add_obj($result, $num, $value);
594        }
595        $result->{' parent'} = $self;
596        weaken $result->{' parent'};
597       #$result->{' realised'} = 0;
598       # removed to address changes being lost when an indirect object
599       #  is realised twice
600        # gdj: FIXME: if any of the ws chars were crs, then the whole
601        # string might not have been read.
602
603    } elsif ($str =~ m/^([0-9]+)(?:$ws_char|$re_comment)+([0-9]+)(?:$ws_char|$re_comment)+obj/s) {
604        # Object
605        my $obj;
606        my $num = $1;
607        $value = $2;
608        $str =~ s/^([0-9]+)(?:$ws_char|$re_comment)+([0-9]+)(?:$ws_char|$re_comment)+obj//s;
609        ($obj, $str) = $self->readval($str, %opts);
610        if ($result = $self->test_obj($num, $value)) {
611            $result->merge($obj);
612        } else {
613            $result = $obj;
614            $self->add_obj($result, $num, $value);
615            $result->{' realised'} = 1;
616        }
617        $str = update($fh, $str) if $update;       # thanks to kundrat@kundrat.sk
618        $str =~ s/^endobj//;
619
620    } elsif ($str =~ m|^/($reg_char*)|s) {
621        # Name
622        $value = $1;
623        $str =~ s|^/($reg_char*)||s;
624        $result = PDF::Builder::Basic::PDF::Name->from_pdf($value, $self);
625
626    } elsif ($str =~ m/^\(/) {
627        # Literal String
628        # We now need to find an unbalanced, unescaped right-paren.
629        # This can't be done with a regex.
630        my $value = '(';
631        $str = substr($str, 1);
632
633        my $nested_level = 1;
634        while (1) {
635            # Ignore everything up to the first escaped or parenthesis character
636            if ($str =~ /^([^\\()]+)(.*)/s) {
637                $value .= $1;
638                $str = $2;
639            }
640
641            # Ignore escaped parentheses
642            if      ($str =~ /^(\\[()])/) {
643                $value .= $1;
644                $str = substr($str, 2);
645
646            } elsif ($str =~ /^\(/) {
647                # Left parenthesis: increase nesting
648                $value .= '(';
649                $str = substr($str, 1);
650                $nested_level++;
651
652            } elsif ($str =~ /^\)/) {
653                # Right parenthesis: decrease nesting
654                $value .= ')';
655                $str = substr($str, 1);
656                $nested_level--;
657                last unless $nested_level;
658
659            } elsif ($str =~ /^(\\[^()])/) {
660                # Other escaped character
661                $value .= $1;
662                $str = substr($str, 2);
663
664            } else {
665                # If there wasn't an escaped or parenthesis character,
666                # read some more.
667
668                # We don't use update because we don't want to remove
669                # whitespace or comments.
670                $fh->read($str, 255, length($str)) or die 'Unterminated string.';
671            }
672        } # end while(TRUE) loop
673
674        $result = PDF::Builder::Basic::PDF::String->from_pdf($value);
675        # end Literal String check
676
677    } elsif ($str =~ m/^</) {
678        # Hex String
679        $str =~ s/^<//;
680        $fh->read($str, 255, length($str)) while (0 > index($str, '>'));
681        ($value, $str) = ($str =~ /^(.*?)>(.*)/s);
682        $result = PDF::Builder::Basic::PDF::String->from_pdf('<' . $value . '>');
683
684    } elsif ($str =~ m/^\[/) {
685        # Array
686        $str =~ s/^\[//;
687        $str = update($fh, $str) if $update;
688        $result = PDFArray();
689        while ($str !~ m/^\]/) {
690            $str =~ s/^$ws_char+//;               # Ignore initial white space
691            $str =~ s/^\%[^\015\012]*$ws_char+//; # Ignore comments
692
693            ($value, $str) = $self->readval($str, %opts);
694            $result->add_elements($value);
695            $str = update($fh, $str) if $update;   # str might just be exhausted!
696        }
697        $str =~ s/^\]//;
698
699    } elsif ($str =~ m/^(true|false)($irreg_char|$)/) {
700        # Boolean
701        $value = $1;
702        $str =~ s/^(?:true|false)//;
703        $result = PDF::Builder::Basic::PDF::Bool->from_pdf($value);
704
705    } elsif ($str =~ m/^([+-.0-9]+)($irreg_char|$)/) {
706        # Number
707        $value = $1;
708        $str =~ s/^([+-.0-9]+)//;
709
710        # If $str only consists of whitespace (or is empty), call update to
711        # see if this is the beginning of an indirect object or reference
712        if ($update and ($str =~ /^$re_whitespace*$/s or $str =~ /^$re_whitespace+[0-9]+$re_whitespace*$/s)) {
713	    $str =~ s/^$re_whitespace+/ /s;
714	    $str =~ s/$re_whitespace+$/ /s;
715            $str = update($fh, $str);
716            if ($str =~ m/^$re_whitespace*([0-9]+)$re_whitespace+(?:R|obj)/s) {
717                return $self->readval("$value $str", %opts);
718            }
719        }
720
721        $result = PDF::Builder::Basic::PDF::Number->from_pdf($value);
722
723    } elsif ($str =~ m/^null($irreg_char|$)/) {
724        # Null
725        $str =~ s/^null//;
726        $result = PDF::Builder::Basic::PDF::Null->new();
727
728    } else {
729        die "Can't parse `$str' near " . ($fh->tell()) . " length " . length($str) . ".";
730    }
731
732    $str =~ s/^$ws_char+//s;
733    return ($result, $str);
734} # end of readval()
735
736=head2 $ref = $p->read_obj($objind, %opts)
737
738Given an indirect object reference, locate it and read the object returning
739the read in object.
740
741=cut
742
743sub read_obj {
744    my ($self, $objind, %opts) = @_;
745
746    my $res = $self->read_objnum($objind->{' objnum'}, $objind->{' objgen'}, %opts) || return;
747    $objind->merge($res) unless $objind eq $res;
748
749    return $objind;
750}
751
752=head2 $ref = $p->read_objnum($num, $gen, %opts)
753
754Returns a fully read object of given number and generation in this file
755
756=cut
757
758sub read_objnum {
759    my ($self, $num, $gen, %opts) = @_;
760
761    croak 'Undefined object number in call to read_objnum($num, $gen)' unless defined $num;
762    croak 'Undefined object generation in call to read_objnum($num, $gen)' unless defined $gen;
763    croak "Invalid object number '$num' in call to read_objnum" unless $num =~ /^[0-9]+$/;
764    croak "Invalid object generation '$gen' in call to read_objnum" unless $gen =~ /^[0-9]+$/;
765
766    my $object_location = $self->locate_obj($num, $gen) || return;
767    my $object;
768
769    # Compressed object
770    if (ref($object_location)) {
771        my ($object_stream_num, $object_stream_pos) = @{$object_location};
772
773        my $object_stream = $self->read_objnum($object_stream_num, 0, %opts);
774        die 'Cannot find the compressed object stream' unless $object_stream;
775
776        $object_stream->read_stream() if $object_stream->{' nofilt'};
777
778        # An object stream starts with pairs of integers containing object numbers and
779        # stream offsets relative to the First key
780        my $fh;
781        my $pairs;
782        unless ($object_stream->{' streamfile'}) {
783            $pairs = substr($object_stream->{' stream'}, 0, $object_stream->{'First'}->val());
784        } else {
785            CORE::open($fh, '<', $object_stream->{' streamfile'});
786            read($fh, $pairs, $object_stream->{'First'}->val());
787        }
788        my @map = split /\s+/, $pairs;
789
790        # Find the offset of the object in the stream
791        my $index = $object_stream_pos * 2;
792        die "Objind $num does not exist at index $index" unless $map[$index] == $num;
793        my $start = $map[$index + 1];
794
795        # Unless this is the last object in the stream, its length is
796        # determined by the offset of the next object.
797        my $last_object_in_stream = $map[-2];
798        my $length;
799        if ($last_object_in_stream == $num) {
800            if ($object_stream->{' stream'}) {
801                $length = length($object_stream->{' stream'}) - $object_stream->{'First'}->val() - $start;
802            } else {
803                $length = (-s $object_stream->{' streamfile'}) - $object_stream->{'First'}->val() - $start;
804            }
805        } else {
806            my $next_start = $map[$index + 3];
807            $length = $next_start - $start;
808        }
809
810        # Read the object from the stream
811        my $stream = "$num 0 obj ";
812        unless ($object_stream->{' streamfile'}) {
813            $stream .= substr($object_stream->{' stream'}, $object_stream->{'First'}->val() + $start, $length);
814        } else {
815            seek($fh, $object_stream->{'First'}->val() + $start, 0);
816            read($fh, $stream, $length, length($stream));
817            close $fh;
818        }
819
820        ($object) = $self->readval($stream, %opts, update => 0);
821        return $object;
822    }
823
824    my $current_location = $self->{' INFILE'}->tell();
825    $self->{' INFILE'}->seek($object_location, 0);
826    ($object) = $self->readval('', %opts);
827    $self->{' INFILE'}->seek($current_location, 0);
828
829    return $object;
830} # end of read_objnum()
831
832=head2 $objind = $p->new_obj($obj)
833
834Creates a new, free object reference based on free space in the cross reference
835chain. If nothing is free, then think up a new number. If C<$obj>, then turns
836that object into this new object rather than returning a new object.
837
838=cut
839
840sub new_obj {
841    my ($self, $base) = @_;
842    my $res;
843
844    if (defined $self->{' free'} and scalar @{$self->{' free'}} > 0) {
845        $res = shift(@{$self->{' free'}});
846        if (defined $base) {
847            my ($num, $gen) = @{$self->{' objects'}{$res->uid()}};
848            $self->remove_obj($res);
849            $self->add_obj($base, $num, $gen);
850            return $self->out_obj($base);
851        } else {
852            $self->{' objects'}{$res->uid()}[2] = 0;
853            return $res;
854        }
855    }
856
857    my $tdict = $self;
858    my $i;
859    while (defined $tdict) {
860        $i = $tdict->{' xref'}{defined($i) ? $i : ''}[0];
861        while (defined $i and $i != 0) {
862            my ($ni, $ng) = @{$tdict->{' xref'}{$i}};
863            unless (defined $self->locate_obj($i, $ng)) {
864                if (defined $base) {
865                    $self->add_obj($base, $i, $ng);
866                    return $base;
867                } else {
868                    $res = $self->test_obj($i, $ng) || $self->add_obj(PDF::Builder::Basic::PDF::Objind->new(), $i, $ng);
869                    $self->out_obj($res);
870                    return $res;
871                }
872            }
873            $i = $ni;
874        }
875        $tdict = $tdict->{' prev'};
876    }
877
878    $i = $self->{' maxobj'}++;
879    if (defined $base) {
880        $self->add_obj($base, $i, 0);
881        $self->out_obj($base);
882        return $base;
883    } else {
884        $res = $self->add_obj(PDF::Builder::Basic::PDF::Objind->new(), $i, 0);
885        $self->out_obj($res);
886        return $res;
887    }
888}
889
890=head2 $p->out_obj($obj)
891
892Indicates that the given object reference should appear in the output xref
893table whether with data or freed.
894
895=cut
896
897sub out_obj {
898    my ($self, $obj) = @_;
899
900    # This is why we've been keeping the outlist CACHE around; to speed
901    # up this method by orders of magnitude (it saves up from having to
902    # grep the full outlist each time through as we'll just do a lookup
903    # in the hash) (which is super-fast).
904    unless (exists $self->{' outlist_cache'}{$obj}) {
905        push @{$self->{' outlist'}}, $obj;
906        # weaken $self->{' outlist'}->[-1];
907        $self->{' outlist_cache'}{$obj} = 1;
908    }
909
910    return $obj;
911}
912
913=head2 $p->free_obj($obj)
914
915Marks an object reference for output as being freed.
916
917=cut
918
919sub free_obj {
920    my ($self, $obj) = @_;
921
922    push @{$self->{' free'}}, $obj;
923    $self->{' objects'}{$obj->uid()}[2] = 1;
924    $self->out_obj($obj);
925
926    return;
927}
928
929=head2 $p->remove_obj($objind)
930
931Removes the object from all places where we might remember it.
932
933=cut
934
935sub remove_obj {
936    my ($self, $objind) = @_;
937
938    # who says it has to be fast
939    delete $self->{' objects'}{$objind->uid()};
940    delete $self->{' outlist_cache'}{$objind};
941    delete $self->{' printed_cache'}{$objind};
942    @{$self->{' outlist'}} = grep { $_ ne $objind } @{$self->{' outlist'}};
943    @{$self->{' printed'}} = grep { $_ ne $objind } @{$self->{' printed'}};
944    $self->{' objcache'}{$objind->{' objnum'}, $objind->{' objgen'}} = undef
945        if $self->{' objcache'}{$objind->{' objnum'}, $objind->{' objgen'}} eq $objind;
946
947    return $self;
948}
949
950=head2 $p->ship_out(@objects)
951
952=head2 $p->ship_out()
953
954Ships the given objects (or all objects for output if C<@objects> is empty) to
955the currently open output file (assuming there is one). Freed objects are not
956shipped, and once an object is shipped it is switched such that this file
957becomes its source and it will not be shipped again unless out_obj is called
958again. Notice that a shipped out object can be re-output or even freed, but
959that it will not cause the data already output to be changed.
960
961=cut
962
963sub ship_out {
964    my ($self, @objects) = @_;
965
966    return unless defined $self->{' OUTFILE'};
967    my $fh = $self->{' OUTFILE'};
968    seek($fh, 0, 2); # go to the end of the file
969
970    @objects = @{$self->{' outlist'}} unless scalar @objects > 0;
971    foreach my $objind (@objects) {
972        next unless $objind->is_obj($self);
973        my $j = -1;
974        for (my $i = 0; $i < scalar @{$self->{' outlist'}}; $i++) {
975            if ($self->{' outlist'}[$i] eq $objind) {
976                $j = $i;
977                last;
978            }
979        }
980        next if $j < 0;
981        splice(@{$self->{' outlist'}}, $j, 1);
982        delete $self->{' outlist_cache'}{$objind};
983        next if grep { $_ eq $objind } @{$self->{' free'}};
984
985        map { $fh->print("\%   $_ \n") } split(/$cr/, $objind->{' comments'}) if $objind->{' comments'};
986        $self->{' locs'}{$objind->uid()} = $fh->tell();
987        my ($objnum, $objgen) = @{$self->{' objects'}{$objind->uid()}}[0..1];
988        $fh->printf('%d %d obj ', $objnum, $objgen);
989        $objind->outobjdeep($fh, $self);
990        $fh->print("\nendobj\n");
991
992        # Note that we've output this obj, not forgetting to update
993        # the cache of what's printed.
994        unless (exists $self->{' printed_cache'}{$objind}) {
995            push @{$self->{' printed'}}, $objind;
996            $self->{' printed_cache'}{$objind}++;
997        }
998    }
999
1000    return $self;
1001} # end of ship_out()
1002
1003=head2 $p->copy($outpdf, \&filter)
1004
1005Iterates over every object in the file reading the object, calling C<filter>
1006with the object, and outputting the result. If C<filter> is not defined,
1007just copies input to output.
1008
1009=cut
1010
1011sub copy {
1012    my ($self, $outpdf, $filter) = @_;
1013    my ($obj, $minl, $mini, $ming);
1014
1015    foreach my $key (grep { not m/^[\s\-]/ } keys %$self) {
1016        $outpdf->{$key} = $self->{$key} unless defined $outpdf->{$key};
1017    }
1018
1019    my $tdict = $self;
1020    while (defined $tdict) {
1021        foreach my $i (sort {$a <=> $b} keys %{$tdict->{' xref'}}) {
1022            my ($nl, $ng, $nt) = @{$tdict->{' xref'}{$i}};
1023            next unless $nt eq 'n';
1024
1025            if ($nl < $minl or $mini == 0) {
1026                $mini = $i;
1027                $ming = $ng;
1028                $minl = $nl;
1029            }
1030            unless ($obj = $self->test_obj($i, $ng)) {
1031                $obj = PDF::Builder::Basic::PDF::Objind->new();
1032                $obj->{' objnum'} = $i;
1033                $obj->{' objgen'} = $ng;
1034                $self->add_obj($obj, $i, $ng);
1035                $obj->{' parent'} = $self;
1036                weaken $obj->{' parent'};
1037                $obj->{' realised'} = 0;
1038            }
1039            $obj->realise();
1040            my $res = defined $filter ? &{$filter}($obj) : $obj;
1041            $outpdf->new_obj($res) unless (!$res || $res->is_obj($outpdf));
1042        }
1043        $tdict = $tdict->{' prev'};
1044    }
1045
1046    # test for linearized and remove it from output
1047    $obj = $self->test_obj($mini, $ming);
1048    if ($obj->isa('PDF::Builder::Basic::PDF::Dict') && $obj->{'Linearized'}) {
1049        $outpdf->free_obj($obj);
1050    }
1051
1052    return $self;
1053} # end of copy()
1054
1055=head1 PRIVATE METHODS & FUNCTIONS
1056
1057The following methods and functions are considered B<private> to this class.
1058This does not mean you cannot use them if you have a need, just that they
1059aren't really designed for users of this class.
1060
1061=head2 $offset = $p->locate_obj($num, $gen)
1062
1063Returns a file offset to the object asked for by following the chain of cross
1064reference tables until it finds the one you want.
1065
1066=cut
1067
1068sub locate_obj {
1069    my ($self, $num, $gen) = @_;
1070
1071    my $tdict = $self;
1072    while (defined $tdict) {
1073        if (ref $tdict->{' xref'}{$num}) {
1074            my $ref = $tdict->{' xref'}{$num};
1075            return $ref unless scalar(@$ref) == 3;
1076
1077            if ($ref->[1] == $gen) {
1078                return $ref->[0] if $ref->[2] eq 'n';
1079                return;        # if $ref->[2] eq 'f';
1080            }
1081        }
1082        $tdict = $tdict->{' prev'};
1083    }
1084
1085    return;
1086}
1087
1088=head2 update($fh, $str, $instream)
1089
1090Keeps reading C<$fh> for more data to ensure that C<$str> has at least a line
1091full for C<readval> to work on. At this point we also take the opportunity to
1092ignore comments.
1093
1094=cut
1095
1096sub update {
1097    my ($fh, $str, $instream) = @_;
1098
1099    print STDERR 'fpos=' . tell($fh) . ' strlen=' . length($str) . "\n" if $readDebug;
1100    if ($instream) {
1101        # we are inside a (possible binary) stream
1102        # so we fetch data till we see an 'endstream'
1103        # -- fredo/2004-09-03
1104        while ($str !~ m/endstream/ and not $fh->eof()) {
1105            print STDERR 'fpos=' . tell($fh) . ' strlen=' . length($str) . "\n" if $readDebug;
1106            $fh->read($str, 314, length($str));
1107        }
1108    } else {
1109        $str =~ s/^$ws_char*//;
1110        while ($str !~ m/$cr/ and not $fh->eof()) {
1111            print STDERR 'fpos=' . tell($fh) . ' strlen=' . length($str) . "\n" if $readDebug;
1112            $fh->read($str, 314, length($str));
1113            $str =~ s/^$ws_char*//so;
1114        }
1115        while ($str =~ m/^\%/) { # restructured by fredo/2003-03-23
1116            print STDERR 'fpos=' . tell($fh) . ' strlen=' . length($str) . "\n" if $readDebug;
1117            $fh->read($str, 314, length($str)) while ($str !~ m/$cr/ and not $fh->eof());
1118            $str =~ s/^\%[^\015\012]*$ws_char*//so; # fixed for reportlab -- fredo
1119        }
1120    }
1121
1122    return $str;
1123} # end of update()
1124
1125=head2 $objind = $p->test_obj($num, $gen)
1126
1127Tests the cache to see whether an object reference (which may or may not have
1128been getobj()ed) has been cached. Returns it if it has.
1129
1130=cut
1131
1132sub test_obj {
1133    my ($self, $num, $gen) = @_;
1134
1135    return $self->{' objcache'}{$num, $gen};
1136}
1137
1138=head2 $p->add_obj($objind)
1139
1140Adds the given object to the internal object cache.
1141
1142=cut
1143
1144sub add_obj {
1145    my ($self, $obj, $num, $gen) = @_;
1146
1147    $self->{' objcache'}{$num, $gen} = $obj;
1148    $self->{' objects'}{$obj->uid()} = [$num, $gen];
1149    # weaken $self->{' objcache'}{$num, $gen};
1150    return $obj;
1151}
1152
1153=head2 $tdict = $p->readxrtr($xpos, %options)
1154
1155Recursive function which reads each of the cross-reference and trailer tables
1156in turn until there are no more.
1157
1158Returns a dictionary corresponding to the trailer chain. Each trailer also
1159includes the corresponding cross-reference table.
1160
1161The structure of the xref private element in a trailer dictionary is of an
1162anonymous hash of cross reference elements by object number. Each element
1163consists of an array of 3 elements corresponding to the three elements read
1164in [location, generation number, free or used]. See the PDF specification
1165for details.
1166
1167See C<open> for options allowed.
1168
1169=cut
1170
1171sub _unpack_xref_stream {
1172    my ($self, $width, $data) = @_;
1173
1174    # handle some oddball cases
1175    if      ($width == 3) {
1176	$data = "\x00$data";
1177	$width = 4;
1178    } elsif ($width == 5) {
1179	$data = "\x00\x00\x00$data";
1180	$width = 8;
1181    } elsif ($width == 6) {
1182	$data = "\x00\x00$data";
1183	$width = 8;
1184    } elsif ($width == 7) {
1185	$data = "\x00$data";
1186	$width = 8;
1187    }
1188    # in all cases, "Network" (Big-Endian) byte order assumed
1189    return unpack('C', $data)  if $width == 1;
1190    return unpack('n', $data)  if $width == 2;
1191    return unpack('N', $data)  if $width == 4;
1192    if ($width == 8) {
1193	# Some ways other packages handle this, without Perl-64, according
1194	# to Vadim Repin. Possibly they end up converting the value to
1195	# "double" behind the scenes if on a 32-bit platform?
1196	# PDF::Tiny  return hex unpack('H16', $data);
1197	# CAM::PDF   my @b = unpack('C*', $data);
1198	#            my $i=0; ($i <<= 8) += shift @b while @b; return $i;
1199
1200	if (substr($data, 0, 4) eq "\x00\x00\x00\x00") {
1201	    # can treat as 32 bit unsigned int
1202            return unpack('N', substr($data, 4, 4));
1203	} else {
1204            # requires 64-bit platform (chip and Perl), else fatal error
1205	    # it may blow up and produce a smoking crater if 32-bit Perl!
1206            # also note that Q needs Big-Endian flag (>) specified, else
1207	    #   it will use the native chip order (Big- or Little- Endian)
1208            return unpack('Q>', $data);
1209	}
1210    }
1211
1212    die "Unsupported field width: $width. 1-8 supported.";
1213}
1214
1215sub readxrtr {
1216    my ($self, $xpos, %options) = @_;
1217    # $xpos SHOULD be pointing to "xref" keyword
1218    my ($tdict, $buf, $xmin, $xnum, $xdiff);
1219
1220    my $fh = $self->{' INFILE'};
1221    $fh->seek($xpos, 0);
1222    $fh->read($buf, 22);  # 22 should overlap into first subsection
1223    $buf = update($fh, $buf); # fix for broken JAWS xref calculation.
1224
1225    my $xlist = {};
1226
1227   ## it seems that some products calculate wrong prev entries (short)
1228   ## so we seek ahead to find one -- fredo; save for now
1229   #while ($buf !~ m/^xref$cr/i && !eof($fh)) {
1230   #    $buf =~ s/^(\s+|\S+|.)//i;
1231   #    $buf = update($fh, $buf);
1232   #}
1233
1234    if ($buf =~ s/^xref$cr//i) {   # remove xrefEOL from buffer
1235        # Plain XRef tables.
1236        #
1237        # look to match startobj# count# EOL of first (or only) subsection
1238        # supposed to be single ASCII space between numbers, but this is
1239        #   more lenient for some writers, allowing 1 or more whitespace
1240        my $subsection_count = 0;
1241        my $entry_format_error = 0;
1242        my $xrefListEmpty = 0;
1243
1244        while ($buf =~ m/^$ws_char*([0-9]+)$ws_char+([0-9]+)$ws_char*$cr(.*?)$/s) {
1245            my $old_buf = $buf;
1246            $xmin = $1;   # starting object number of this subsection
1247            $xnum = $2;   # number of entries in this subsection
1248            $buf  = $3;   # remainder of buffer
1249            $subsection_count++;
1250            # go back and warn if other than single space separating numbers
1251            unless ($old_buf =~ /^[0-9]+ [0-9]+$cr/) {  #orig 'warn'
1252		if ($options{'-diags'} == 1) {
1253                    # See PDF 1.7 section 7.5.4: Cross-Reference Table
1254                    warn "Malformed xref: subsection header needs a single\n" .
1255                         "ASCII space between the numbers and no extra spaces.\n";
1256	        }
1257            }
1258            $xdiff = length($buf); # how much remaining in buffer
1259
1260            # in case xnum == 0 is permitted (or used and tolerated by readers),
1261            #   skip over entry reads and go to next subsection
1262            if ($xnum < 1) {
1263                if ($options{'-diags'} == 1) {
1264                    warn "Xref subsection has 0 entries. Skipped.\n";
1265		}
1266                $xrefListEmpty = 1;
1267                next;
1268            }
1269
1270            # read chunk of entire subsection list
1271            my $entry_size = 20;
1272            # test read first entry, see if $cr in expected place, adjust size
1273            $fh->read($buf, $entry_size * 1 - $xdiff + 15, $xdiff);
1274            if ($buf =~ m/^(.*?)$cr/) {
1275                $entry_size = length($1) + 2;
1276	    }
1277            if ($entry_size != 20 && $options{'-diags'} == 1) {
1278                warn "Xref entries supposed to be 20 bytes long, are $entry_size.\n";
1279            }
1280            $xdiff = length($buf);
1281
1282            # read remaining entries
1283            $fh->read($buf, $entry_size * $xnum - $xdiff + 15, $xdiff);
1284            # each entry is two integers and flag. spec says single ASCII space
1285            #   between each field and certain length for each (10, 5, 1), so
1286            #   this appears to be more lenient than spec
1287            # is object 0 supposed to be in subsection 1, or is any place OK?
1288            while ($xnum-- > 0 and
1289                   $buf =~ m/^$ws_char*(\d+)$ws_char+(\d+)$ws_char+([nf])$ws_char*$cr/) {
1290                # check if format doesn't match spec
1291                if ($buf =~ m/^\d{10} \d{5} [nf]$cr/ ||
1292                    $entry_format_error) {
1293                    # format OK or have already reported format problem
1294                } else {
1295		    if ($options{'-diags'} == 1) {
1296                        warn "Xref entry readable, but doesn't meet PDF spec.\n";
1297                    }
1298                    $entry_format_error++;
1299                }
1300
1301                $buf =~ s/^$ws_char*(\d+)$ws_char+(\d+)$ws_char+([nf])$ws_char*$cr//;
1302                # $1 = object's starting offset in file (n) or
1303                #      next object in free list (f) [0 if last]
1304                # $2 = generation number (n) or 65535 for object 0 (f) or
1305                #      next generation number (f)
1306                # $3 = flag (n = object in use, f = free)
1307                # buf reduced by entry just processed
1308                if (exists $xlist->{$xmin}) {
1309                    if ($options{'-diags'} == 1) {
1310                        warn "Duplicate object number $xmin in xref table ignored.\n";
1311		    }
1312                } else {
1313                    $xlist->{$xmin} = [$1, $2, $3];
1314                    if ($xmin == 0 && $subsection_count > 1 && $options{'-diags'} == 1) {
1315                        warn "Xref object 0 entry not in first subsection.\n";
1316                    }
1317                }
1318                $xmin++;
1319            } # traverse one subsection for objects xmin through xmin+xnum-1
1320            # go back for next subsection (if any)
1321        } # loop through xref subsections
1322        # fall through to here when run out of xref subsections
1323        # xlist should have two or more object entries, may not be contiguous
1324
1325        # should have an object 0
1326        #   at this point, no idea if object 0 was in first subsection (legal?)
1327        #   could attempt a fixup if no object 0 found. many fixups are quite
1328        #     risky and could end up corrupting the free list.
1329        #   there's no guarantee that a proper free list will result, but any
1330        #     error should hopefully be caught further on
1331        if (!exists $xlist->{'0'} && !$xrefListEmpty) {
1332            # for now, 1 subsection starting with 1, and only object 1 in
1333            #   free list, try to fix up
1334            if ($subsection_count == 1 && exists $xlist->{'1'}) {
1335                # apparently a common enough error in PDF writers
1336
1337                if ($xlist->{'1'}[0] == 0 &&  # only member of free list
1338                    $xlist->{'1'}[1] == 65535 &&
1339                    $xlist->{'1'}[2] eq 'f') {
1340                    # object 1 appears to be the free list head, so shift
1341                    #   down all objects
1342		    if ($options{'-diags'} == 1) {
1343                        warn "xref appears to be mislabeled starting with 1. Shift down all elements.\n";
1344		    }
1345                    my $next = 1;
1346                    while (exists $xlist->{$next}) {
1347                        $xlist->{$next - 1} = $xlist->{$next};
1348                        $next++;
1349                    }
1350                    delete $xlist->{--$next};
1351
1352                } else {
1353                    # if object 1 does not appear to be a free list head,
1354                    #   insert a new object 0
1355		    if ($options{'-diags'} == 1) {
1356                        warn "Xref appears to be missing object 0. Insert a new one.\n";
1357		    }
1358                    $xlist->{'0'} = [0, 65535, 'f'];
1359                }
1360            } else {
1361		if ($options{'-diags'} == 1) {
1362                    warn "Malformed cross reference list in PDF file $self->{' fname'} -- no object 0 (free list head)\n";
1363		}
1364                $xlist->{'0'} = [0, 65535, 'f'];
1365            }
1366        } # no object 0 entry
1367
1368        # build/validate the free list (and no active objects have f flag)
1369        my @free_list;
1370        foreach (sort {$a <=> $b} keys %{ $xlist }) {
1371            # if 'f' flag, is in free list
1372            if      ($xlist->{$_}[2] eq 'f') {
1373                if ($xlist->{$_}[1] <= 0 && $options{'-diags'} == 1) {
1374                    warn "Xref free list entry $_ with bad next generation number.\n";
1375                } else {
1376                    push @free_list, $_; # should be in numeric order (0 first)
1377                }
1378            } elsif ($xlist->{$_}[2] eq 'n') {
1379                if ($xlist->{$_}[0] <= 0 && $options{'-diags'} == 1) {
1380                    warn "Xref active object $_ entry with bad length ".($xlist->{$_}[1])."\n";
1381                }
1382                if ($xlist->{$_}[1] < 0 && $options{'-diags'} == 1) {
1383                    warn "Xref active object $_ entry with bad generation number ".($xlist->{$_}[1])."\n";
1384                }
1385            } else {
1386		if ($options{'-diags'} == 1) {
1387                    warn "Xref entry has flag that is not 'f' or 'n'.\n";
1388	        }
1389            }
1390        } # go through xlist and build free_list and check entries
1391        # traverse free list and check that "next object" is also in free list
1392        my $next_free = 0;  # object 0 should always be in free list
1393        if ($xlist->{'0'}[1] != 65535 && $options{'-diags'} == 1) {
1394            warn "Object 0 next generation is not 65535.\n";
1395        }
1396        do {
1397            if ($xlist->{$next_free}[2] ne 'f') {
1398                if ($options{'-diags'} == 1) {
1399                    warn "Corrupted free object list: next=$next_free is not a free object.\n";
1400		}
1401                $next_free = 0; # force end of free list
1402            } else {
1403                $next_free = $xlist->{$next_free}[0];
1404            }
1405            # remove this entry from free list array
1406            splice(@free_list, index(@free_list, $next_free), 1);
1407        } while ($next_free && exists $xlist->{$next_free});
1408        if (scalar @free_list && $options{'-diags'} == 1) {
1409            warn "Corrupted xref list: object(s) @free_list marked as free, but are not in free chain.\n";
1410        }
1411
1412        # done with cross reference table, so go on to trailer
1413        if ($buf !~ /^\s*trailer\b/i && $options{'-diags'} == 1) {  #orig 'die'
1414            warn "Malformed trailer in PDF file $self->{' fname'} at " . ($fh->tell() - length($buf));
1415        }
1416
1417        $buf =~ s/^\s*trailer\b//i;
1418
1419        ($tdict, $buf) = $self->readval($buf);
1420
1421    } elsif ($buf =~ m/^(\d+)\s+(\d+)\s+obj/i) {
1422        my ($xref_obj, $xref_gen) = ($1, $2);
1423
1424	PDF::Builder->verCheckOutput(1.5, "importing cross-reference stream");
1425        # XRef streams
1426        ($tdict, $buf) = $self->readval($buf);
1427
1428        unless ($tdict->{' stream'}) {
1429	    if ($options{'-diags'} == 1) {
1430                warn "Malformed XRefStm at $xref_obj $xref_gen obj in PDF file $self->{' fname'}";
1431	    }
1432        }
1433        $tdict->read_stream(1);
1434
1435        my $stream = $tdict->{' stream'};
1436        my @widths = map { $_->val() } @{$tdict->{'W'}->val()};
1437
1438        my $start = 0;
1439        my $last;
1440
1441        my @index;
1442        if (defined $tdict->{'Index'}) {
1443            @index = map { $_->val() } @{$tdict->{'Index'}->val()};
1444        } else {
1445            @index = (0, $tdict->{'Size'}->val());
1446        }
1447
1448        while (scalar @index) {
1449            $start = shift(@index);
1450            $last = $start + shift(@index) - 1;
1451
1452            for my $i ($start...$last) {
1453                # Replaced "for $xmin" because it creates a loop-specific local
1454                # variable, and we need $xmin to be correct for maxobj below.
1455                $xmin = $i;
1456
1457                my @cols;
1458
1459                for my $w (@widths) {
1460                    my $data;
1461                    $data = $self->_unpack_xref_stream($w, substr($stream, 0, $w, '')) if $w;
1462
1463                    push @cols, $data;
1464                }
1465
1466                $cols[0] = 1 unless defined $cols[0];
1467                if ($cols[0] > 2 && $options{'-diags'} == 1) {
1468                    warn "Invalid XRefStm entry type ($cols[0]) at $xref_obj $xref_gen obj";
1469                }
1470
1471                next if exists $xlist->{$xmin};
1472
1473                my @objind = ($cols[1], defined($cols[2]) ? $cols[2] : ($xmin ? 0 : 65535));
1474                push @objind, ($cols[0] == 0? 'f': 'n') if $cols[0] < 2;
1475
1476                $xlist->{$xmin} = \@objind;
1477            }
1478        }
1479
1480    } else {  #orig 'die'
1481	if ($options{'-diags'} == 1) {
1482            warn "Malformed xref in PDF file $self->{' fname'}";
1483	}
1484    }
1485
1486    # did we get to here without managing to set $xmin?
1487    $xmin ||= 0;
1488
1489    $tdict->{' loc'} = $xpos;
1490    $tdict->{' xref'} = $xlist;
1491    $self->{' maxobj'} = $xmin + 1 if $xmin + 1 > $self->{' maxobj'};
1492    $tdict->{' prev'} = $self->readxrtr($tdict->{'Prev'}->val(), %options)
1493        if (defined $tdict->{'Prev'} and $tdict->{'Prev'}->val() != 0);
1494    delete $tdict->{' prev'} unless defined $tdict->{' prev'};
1495
1496    return $tdict;
1497} # end of readxrtr()
1498
1499=head2 $p->out_trailer($tdict, $update)
1500
1501=head2 $p->out_trailer($tdict)
1502
1503Outputs the body and trailer for a PDF file by outputting all the objects in
1504the ' outlist' and then outputting a xref table for those objects and any
1505freed ones. It then outputs the trailing dictionary and the trailer code.
1506
1507=cut
1508
1509sub out_trailer {
1510    my ($self, $tdict, $update) = @_;
1511
1512    my $fh = $self->{' OUTFILE'};
1513
1514    while (@{$self->{' outlist'}}) {
1515        $self->ship_out();
1516    }
1517
1518    #    $size = @{$self->{' printed'}} + @{$self->{' free'}};
1519    #    $tdict->{'Size'} = PDFNum($tdict->{'Size'}->val() + $size);
1520    # PDFSpec 1.3 says for /Size: (Required) Total number of entries in the file's
1521    # cross-reference table, including the original table and all updates. Which
1522    # is what the previous two lines implement.
1523    # But this seems to make Acrobat croak on saving so we try the following from
1524    # basil.duval@epfl.ch
1525    $tdict->{'Size'} = PDFNum($self->{' maxobj'});
1526
1527    my $tloc = $fh->tell();
1528##  $fh->print("xref\n");
1529    # instead of directly outputting (fh->print) xreflist, we accumulate in @out
1530    my @out;
1531    my @xreflist = sort { $self->{' objects'}{$a->uid()}[0] <=> $self->{' objects'}{$b->uid()}[0] } (@{$self->{' printed'} || []}, @{$self->{' free'} || []});
1532
1533    my ($i, $j, $k);
1534    unless ($update) {
1535        $i = 1;
1536        for ($j = 0; $j < @xreflist; $j++) {
1537            my @inserts;
1538            $k = $xreflist[$j];
1539            while ($i < $self->{' objects'}{$k->uid()}[0]) {
1540                my ($n) = PDF::Builder::Basic::PDF::Objind->new();
1541                $self->add_obj($n, $i, 0);
1542                $self->free_obj($n);
1543                push(@inserts, $n);
1544                $i++;
1545            }
1546            splice(@xreflist, $j, 0, @inserts);
1547            $j += @inserts;
1548            $i++;
1549        }
1550    }
1551
1552    my @freelist = sort { $self->{' objects'}{$a->uid()}[0] <=> $self->{' objects'}{$b->uid()}[0] } @{$self->{' free'} || []};
1553
1554    $j = 0; my $first = -1; $k = 0;
1555    for ($i = 0; $i <= $#xreflist + 1; $i++) {
1556        # if ($i == 0) {
1557        #     $first = $i; $j = $xreflist[0]->{' objnum'};
1558        #     $fh->printf("0 1\n%010d 65535 f \n", $ff);
1559        # }
1560        if ($i > $#xreflist || $self->{' objects'}{$xreflist[$i]->uid()}[0] != $j + 1) {
1561##          $fh->print(($first == -1 ? "0 " : "$self->{' objects'}{$xreflist[$first]->uid()}[0] ") . ($i - $first) . "\n");
1562            push @out, ($first == -1 ? "0 " : "$self->{' objects'}{$xreflist[$first]->uid()}[0] ") . ($i - $first) . "\n";
1563            if ($first == -1) {
1564##              $fh->printf("%010d 65535 f \n", defined $freelist[$k] ? $self->{' objects'}{$freelist[$k]->uid()}[0] : 0);
1565                push @out, sprintf("%010d 65535 f \n", defined $freelist[$k] ? $self->{' objects'}{$freelist[$k]->uid()}[0] : 0);
1566                $first = 0;
1567            }
1568            for ($j = $first; $j < $i; $j++) {
1569                my $xref = $xreflist[$j];
1570                if (defined $freelist[$k] && defined $xref && "$freelist[$k]" eq "$xref") {
1571                    $k++;
1572##                  $fh->print(pack("A10AA5A4",
1573                    push(@out, pack("A10AA5A4",
1574                                    sprintf("%010d", (defined $freelist[$k] ?
1575                                                      $self->{' objects'}{$freelist[$k]->uid()}[0] : 0)), " ",
1576                                    sprintf("%05d", $self->{' objects'}{$xref->uid()}[1] + 1),
1577                                    " f \n"));
1578                } else {
1579##                  $fh->print(pack("A10AA5A4", sprintf("%010d", $self->{' locs'}{$xref->uid()}), " ",
1580                    push(@out, pack("A10AA5A4", sprintf("%010d", $self->{' locs'}{$xref->uid()}), " ",
1581                            sprintf("%05d", $self->{' objects'}{$xref->uid()}[1]),
1582                            " n \n"));
1583                }
1584            }
1585            $first = $i;
1586            $j = $self->{' objects'}{$xreflist[$i]->uid()}[0] if ($i < scalar @xreflist);
1587
1588        } else {
1589            $j++;
1590        }
1591    } # end for loop through xreflists
1592##  $fh->print("trailer\n");
1593##  $tdict->outobjdeep($fh, $self);
1594##  $fh->print("\nstartxref\n$tloc\n%%EOF\n");
1595## start new code for 117184 fix by Vadim. @out has array of xref content
1596    if (exists $tdict->{'Type'} and $tdict->{'Type'}->val() eq 'XRef') {
1597
1598        my (@index, @stream);
1599        for (@out) {  # @out is the accumulated cross reference list
1600            my @a = split;
1601            @a == 2 ? push @index, @a : push @stream, \@a;
1602        }
1603        my $i = $self->{' maxobj'}++;
1604        $self->add_obj($tdict, $i, 0);
1605        $self->out_obj($tdict);
1606
1607        push @index, $i, 1;
1608        push @stream, [ $tloc, 0, 'n' ];
1609
1610        my $len = $tloc > 0xFFFF ? 4 : 2;           # don't expect files > 4 Gb
1611        my $tpl = $tloc > 0xFFFF ? 'CNC' : 'CnC';   # don't expect gennum > 255, it's absurd.
1612                                                    # Adobe doesn't use them anymore anyway
1613        my $sstream = '';
1614        my @prev = ( 0 ) x ( $len + 2 );  # init prev to all 0's
1615        for (@stream) {
1616	    # OK to zero out gennum of 65535 for a cross reference stream,
1617	    # rather than just truncating to 255 -- Vadim
1618	    $_->[ 1 ] = 0 if $_->[ 1 ] == 65535 and
1619	                     $_->[ 2 ] eq 'f';
1620            # make sure is 0..255, since will pack with 'C' code -- Phil
1621	    if ($_->[1] > 0xFF) {
1622		print "generation number ".($_->[1])." in entry '$_->[0] $_->[1] $_->[2]' exceeds 256, reduced to ".($_->[1] & 0x00FF)."\n";
1623	    }
1624	    $_->[ 1 ] &= 0x00FF;
1625            my @line = unpack 'C*', pack $tpl, $_->[ 2 ] eq 'n'? 1 : 0, @{ $_ }[ 0 .. 1 ];
1626
1627            $sstream .= pack 'C*', 2, # prepend filtering method, "PNG Up"
1628                map {($line[ $_ ] - $prev[ $_ ] + 256) % 256} 0 .. $#line;
1629            @prev    = @line;
1630        }
1631	# build a dictionary for the cross reference stream
1632        $tdict->{'Size'} = PDFNum($i + 1);
1633        $tdict->{'Index'} = PDFArray(map { PDFNum($_) } @index);
1634        $tdict->{'W'} = PDFArray(map { PDFNum($_) } 1, $len, 1);
1635        $tdict->{'Filter'} = PDFName('FlateDecode');
1636
1637	# it's compressed
1638        $tdict->{'DecodeParms'} = PDFDict();
1639        $tdict->{'DecodeParms'}->val()->{'Predictor'} = PDFNum(12);
1640        $tdict->{'DecodeParms'}->val()->{'Columns'} = PDFNum($len + 2);
1641
1642        $sstream = PDF::Builder::Basic::PDF::Filter::FlateDecode->new()->outfilt($sstream, 1);
1643        $tdict->{' stream'} = $sstream;
1644        $tdict->{' nofilt'} = 1;
1645        delete $tdict->{'Length'};
1646        $self->ship_out();
1647    } else {
1648	# delete may be moved later by Vadim closer to where XRefStm created
1649	delete $tdict->{'XRefStm'};
1650	# almost the original code
1651        $fh->print("xref\n", @out, "trailer\n");
1652        $tdict->outobjdeep($fh, $self);
1653        $fh->print("\n");
1654    }
1655    $fh->print("startxref\n$tloc\n%%EOF\n");
1656## end of new code
1657
1658    return;
1659} # end of out_trailer()
1660
1661=head2 PDF::Builder::Basic::PDF::File->_new()
1662
1663Creates a very empty PDF file object (used by new() and open())
1664
1665=cut
1666
1667sub _new {
1668    my $class = shift();
1669    my $self = {};
1670
1671    bless $self, $class;
1672    $self->{' outlist'}       = [];
1673    $self->{' outlist_cache'} = {};     # A cache of what's in the 'outlist'
1674    $self->{' maxobj'}        = 1;
1675    $self->{' objcache'}      = {};
1676    $self->{' objects'}       = {};
1677
1678    return $self;
1679}
1680
16811;
1682
1683=head1 AUTHOR
1684
1685Martin Hosken Martin_Hosken@sil.org
1686
1687Copyright Martin Hosken 1999 and onwards
1688
1689No warranty or expression of effectiveness, least of all regarding anyone's
1690safety, is implied in this software or documentation.
1691