1#======================================================================= 2# 3# THIS IS A REUSED PERL MODULE, FOR PROPER LICENCING TERMS SEE BELOW: 4# 5# Copyright Martin Hosken <Martin_Hosken@sil.org> 6# 7# Modified for PDF::API2 by Alfred Reibenschuh <alfredreibenschuh@gmx.net> 8# 9# No warranty or expression of effectiveness, least of all regarding 10# anyone's safety, is implied in this software or documentation. 11# 12# This specific module is licensed under the Perl Artistic License. 13# Effective 28 January 2021, the original author and copyright holder, 14# Martin Hosken, has given permission to use and redistribute this module 15# under the MIT license. 16# 17#======================================================================= 18package PDF::Builder::Basic::PDF::File; 19 20use strict; 21use warnings; 22 23our $VERSION = '3.023'; # VERSION 24our $LAST_UPDATE = '3.023'; # manually update whenever code is changed 25 26=head1 NAME 27 28PDF::Builder::Basic::PDF::File - Holds the trailers and cross-reference tables for a PDF file 29 30=head1 SYNOPSIS 31 32 $p = PDF::Builder::Basic::PDF::File->open("filename.pdf", 1); 33 $p->new_obj($obj_ref); 34 $p->free_obj($obj_ref); 35 $p->append_file(); 36 $p->close_file(); 37 $p->release(); # IMPORTANT! 38 39=head1 DESCRIPTION 40 41This class keeps track of the directory aspects of a PDF file. There are two 42parts to the directory: the main directory object, which is the parent to all 43other objects, and a chain of cross-reference tables and corresponding trailer 44dictionaries, starting with the main directory object. 45 46=head1 INSTANCE VARIABLES 47 48Within this class hierarchy, rather than making everything visible via methods, 49which would be a lot of work, there are various instance variables which are 50accessible via associative array referencing. To distinguish instance variables 51from content variables (which may come from the PDF content itself), each such 52variable will start with a space. 53 54Variables which do not start with a space directly reflect elements in a PDF 55dictionary. In the case of a C<PDF::Builder::Basic::PDF::File>, the elements 56reflect those in the trailer dictionary. 57 58Since some variables are not designed for class users to access, variables are 59marked in the documentation with B<(R)> to indicate that such an entry should 60only be used as B<read-only> information. B<(P)> indicates that the information 61is B<private>, and not designed for user use at all, but is included in the 62documentation for completeness and to ensure that nobody else tries to use it. 63 64=over 65 66=item newroot 67 68This variable allows the user to create a new root entry to occur in the trailer 69dictionary which is output when the file is written or appended. If you wish to 70override the root element in the dictionary you have, use this entry to indicate 71that without losing the current Root entry. Notice that newroot should point to 72a PDF level object and not just to a dictionary which does not have object 73status. 74 75=item INFILE (R) 76 77Contains the filehandle used to read this information into this PDF directory. 78It is an IO object. 79 80=item fname (R) 81 82This is the filename which is reflected by INFILE, or the original IO object 83passed in. 84 85=item update (R) 86 87This indicates that the read file has been opened for update and that at some 88point, C<< $p->appendfile() >> can be called to update the file with the 89changes that have been made to the memory representation. 90 91=item maxobj (R) 92 93Contains the first usable object number above any that have already appeared 94in the file so far. 95 96=item outlist (P) 97 98This is a list of Objind which are to be output when the next C<appendfile()> 99or C<outfile()> occurs. 100 101=item firstfree (P) 102 103Contains the first free object in the free object list. Free objects are removed 104from the front of the list and added to the end. 105 106=item lastfree (P) 107 108Contains the last free object in the free list. It may be the same as the 109C<firstfree> if there is only one free object. 110 111=item objcache (P) 112 113All objects are held in the cache to ensure that a system only has one 114occurrence of each object. In effect, the objind class acts as a container type 115class to hold the PDF object structure, and it would be unfortunate if there 116were two identical place-holders floating around a system. 117 118=item epos (P) 119 120The end location of the read-file. 121 122=back 123 124Each trailer dictionary contains a number of private instance variables which 125hold the chain together. 126 127=over 128 129=item loc (P) 130 131Contains the location of the start of the cross-reference table preceding the 132trailer. 133 134=item xref (P) 135 136Contains an anonymous array of each cross-reference table entry. 137 138=item prev (P) 139 140A reference to the previous table. Note this differs from the Prev entry which 141is in PDF, which contains the location of the previous cross-reference table. 142 143=back 144 145=head1 METHODS 146 147=cut 148 149use Scalar::Util qw(blessed weaken); 150 151use vars qw($cr $irreg_char $reg_char $ws_char $delim_char %types); 152 153$ws_char = '[ \t\r\n\f\0]'; 154$delim_char = '[][<>{}()/%]'; 155$reg_char = '[^][<>{}()/% \t\r\n\f\0]'; 156$irreg_char = '[][<>{}()/% \t\r\n\f\0]'; 157# \015 = x0D = CR or \r, \012 = x0A = LF or \n 158# TBD a line-end character is space CR ' \r', space LF ' \n', or CR LF '\r\n' 159# have seen working PDFs with just a CR and space CR 160$cr = '\s*(?:\015|\012|(?:\015\012))'; 161 162my $re_comment = qr/(?:\%[^\r\n]*)/; 163my $re_whitespace = qr/(?:[ \t\r\n\f\0]|$re_comment)/; 164 165%types = ( 166 'Page' => 'PDF::Builder::Basic::PDF::Page', 167 'Pages' => 'PDF::Builder::Basic::PDF::Pages', 168); 169 170my $readDebug = 0; 171 172use Carp; 173use IO::File; 174 175# Now for the basic PDF types 176use PDF::Builder::Basic::PDF::Utils; 177 178use PDF::Builder::Basic::PDF::Array; 179use PDF::Builder::Basic::PDF::Bool; 180use PDF::Builder::Basic::PDF::Dict; 181use PDF::Builder::Basic::PDF::Name; 182use PDF::Builder::Basic::PDF::Number; 183use PDF::Builder::Basic::PDF::Objind; 184use PDF::Builder::Basic::PDF::String; 185use PDF::Builder::Basic::PDF::Page; 186use PDF::Builder::Basic::PDF::Pages; 187use PDF::Builder::Basic::PDF::Null; 188use POSIX qw(ceil floor); 189 190=head2 PDF::Builder::Basic::PDF::File->new() 191 192Creates a new, empty file object which can act as the host to other PDF objects. 193Since there is no file associated with this object, it is assumed that the 194object is created in readiness for creating a new PDF file. 195 196=cut 197 198sub new { 199 my ($class, $root) = @_; 200 my $self = $class->_new(); 201 202 unless ($root) { 203 $root = PDFDict(); 204 $root->{'Type'} = PDFName('Catalog'); 205 } 206 $self->new_obj($root); 207 $self->{'Root'} = $root; 208 209 return $self; 210} 211 212=head2 $p = PDF::Builder::Basic::PDF::File->open($filename, $update, %options) 213 214Opens the file and reads all the trailers and cross reference tables to build 215a complete directory of objects. 216 217C<$filename> may be a string or an IO object. 218 219C<$update> specifies whether this file is being opened for updating and editing 220(I<TRUE> value), or simply to be read (I<FALSE> or undefined value). 221 222C<%options> may include 223 224=over 225 226=item -diags => 1 227 228If C<-diags> is set to 1, various warning messages will be given if a 229suspicious PDF structure is found, and some fixup may be attempted. There is 230no guarantee that any fixup will change the PDF to legitimate, or that there 231won't be other problems found further down the line. If this flag is I<not> 232given, and a structural problem is found, it is fairly likely that errors (and 233even a program B<crash>) may happen further along. If you experience crashes 234when reading in a PDF file, try running with C<-diags> and see what is reported. 235 236There are many PDF files out "in the wild" which, while failing to conform to 237Adobe's standards, appear to be tolerated by PDF Readers. Thus, Builder will no 238longer fail on them, but merely comment on their existence. 239 240=back 241 242=cut 243 244sub open { 245 my ($class, $filename, $update, %options) = @_; 246 my ($fh, $buffer); 247 $options{'-diags'} = 0 if not defined $options{'-diags'}; # default 248 249 my $comment = ''; # any comment jammed into the PDF header 250 my $self = $class->_new(); 251 if (ref $filename) { 252 $self->{' INFILE'} = $filename; 253 if ($update) { 254 $self->{' update'} = 1; 255 $self->{' OUTFILE'} = $filename; 256 } 257 $fh = $filename; 258 } else { 259 die "File '$filename' does not exist!" unless -f $filename; 260 $fh = IO::File->new(($update ? '+' : '') . "<$filename") || return; 261 $self->{' INFILE'} = $fh; 262 if ($update) { 263 $self->{' update'} = 1; 264 $self->{' OUTFILE'} = $fh; 265 $self->{' fname'} = $filename; 266 } 267 } 268 binmode $fh, ':raw'; 269 $fh->seek(0, 0); # go to start of file 270 $fh->read($buffer, 255); 271 unless ($buffer =~ m/^\%PDF\-(\d+\.\d+)(.*?)$cr/mo) { 272 die "$filename does not contain a PDF version"; 273 } 274 $self->{' version'} = $1; 275 # can't run verCheckInput() yet, as full ' version' not set 276 if (defined $2 && length($2) > 0) { 277 $comment = $2; # save for output as comment 278 # since we just echo the original header + comment, unless that causes 279 # problems in some Readers, we can just leave it be (no call to strip 280 # out inline comment and create a separate comment further along). 281 } 282 283 $fh->seek(0, 2); # go to end of file 284 my $end = $fh->tell(); 285 $self->{' epos'} = $end; 286 foreach my $offset (1 .. 64) { 287 $fh->seek($end - 16 * $offset, 0); 288 $fh->read($buffer, 16 * $offset); 289 last if $buffer =~ m/startxref($cr|\s*)\d+($cr|\s*)\%\%eof.*?/i; 290 } 291 unless ($buffer =~ m/startxref[^\d]+([0-9]+)($cr|\s*)\%\%eof.*?/i) { 292 if ($options{'-diags'} == 1) { 293 warn "Malformed PDF file $filename"; #orig 'die' 294 } 295 } 296 my $xpos = $1; 297 $self->{' xref_position'} = $xpos; 298 299 my $tdict = $self->readxrtr($xpos, %options); 300 foreach my $key (keys %$tdict) { 301 $self->{$key} = $tdict->{$key}; 302 } 303 304 return $self; 305} # end of open() 306 307=head2 $p->release() 308 309Releases ALL of the memory used by the PDF document and all of its 310component objects. After calling this method, do B<NOT> expect to 311have anything left in the C<PDF::Builder::Basic::PDF::File> object 312(so if you need to save, be sure to do it before calling this method). 313 314B<NOTE>, that it is important that you call this method on any 315C<PDF::Builder::Basic::PDF::File> object when you wish to destroy it and 316free up its memory. Internally, PDF files have an enormous number of 317cross-references, and this causes circular references within the 318internal data structures. Calling C<release()> causes a brute-force 319cleanup of the data structures, freeing up all of the memory. Once 320you've called this method, though, don't expect to be able to do 321anything else with the C<PDF::Builder::Basic::PDF::File> object; it'll 322have B<no> internal state whatsoever. 323 324=cut 325 326# Maintainer's Question: Couldn't this be handled by a DESTROY method 327# instead of requiring an explicit call to release()? 328sub release { 329 my $self = shift(); 330 331 return $self unless ref($self); 332 my @tofree = values %$self; 333 334 foreach my $key (keys %$self) { 335 $self->{$key} = undef; 336 delete $self->{$key}; 337 } 338 339 while (my $item = shift @tofree) { 340 if (blessed($item) and $item->can('release')) { 341 $item->release(1); 342 } elsif (ref($item) eq 'ARRAY') { 343 push @tofree, @$item; 344 } elsif (ref($item) eq 'HASH') { 345 push @tofree, values %$item; 346 foreach my $key (keys %$item) { 347 $item->{$key} = undef; 348 delete $item->{$key}; 349 } 350 } else { 351 $item = undef; 352 } 353 } 354 355 return; 356} # end of release() 357 358=head2 $p->append_file() 359 360Appends the objects for output to the read file and then appends the 361appropriate table. 362 363=cut 364 365sub append_file { 366 my $self = shift(); 367 return unless $self->{' update'}; 368 369 my $fh = $self->{' INFILE'}; 370 371 # hack to upgrade pdf-version number to support requested features in 372 # higher versions than the pdf was originally created. WARNING: new version 373 # must be exactly SAME length as the old (e.g., 1.6 replacing 1.4), or 374 # problems are likely with overwriting header. perhaps some day we will 375 # need to check the old version being ovewritten, and adjust something to 376 # avoid corrupting the file. 377 my $version = $self->{' version'} || 1.4; 378 $fh->seek(0, 0); 379 # assume that any existing EOL after version will be reused 380 $fh->print("%PDF-$version"); 381 382 my $tdict = PDFDict(); 383 $tdict->{'Prev'} = PDFNum($self->{' loc'}); 384 $tdict->{'Info'} = $self->{'Info'}; 385 if (defined $self->{' newroot'}) { 386 $tdict->{'Root'} = $self->{' newroot'}; 387 } else { 388 $tdict->{'Root'} = $self->{'Root'}; 389 } 390 $tdict->{'Size'} = $self->{'Size'}; 391 392 foreach my $key (grep { $_ !~ m/^\s/ } keys %$self) { 393 $tdict->{$key} = $self->{$key} unless defined $tdict->{$key}; 394 } 395 396 $fh->seek($self->{' epos'}, 0); 397 $self->out_trailer($tdict, $self->{' update'}); 398 close $self->{' OUTFILE'}; 399 400 return; 401} # end of append_file() 402 403=head2 $p->out_file($fname) 404 405Writes a PDF file to a file of the given filename, based on the current list of 406objects to be output. It creates the trailer dictionary based on information 407in C<$self>. 408 409$fname may be a string or an IO object. 410 411=cut 412 413sub out_file { 414 my ($self, $fname) = @_; 415 416 $self = $self->create_file($fname); 417 $self = $self->close_file(); 418 419 return $self; 420} 421 422=head2 $p->create_file($fname) 423 424Creates a new output file (no check is made of an existing open file) of 425the given filename or IO object. Note: make sure that C<< $p->{' version'} >> 426is set correctly before calling this function. 427 428=cut 429 430sub create_file { 431 my ($self, $filename) = @_; 432 my $fh; 433 434 $self->{' fname'} = $filename; 435 if (ref $filename) { 436 $fh = $filename; 437 } else { 438 $fh = IO::File->new(">$filename") || die "Unable to open $filename for writing"; 439 binmode($fh,':raw'); 440 } 441 442 $self->{' OUTFILE'} = $fh; 443 $fh->print('%PDF-' . ($self->{' version'} || '1.4') . "\n"); 444 $fh->print("%\xC6\xCD\xCD\xB5\n"); # and some binary stuff in a comment. 445 446 # PDF spec requires 4 or more "binary" bytes (128 or higher value) in a 447 # comment immediately following the PDF-x.y header, to alert reader that 448 # there is binary data. Actual values are apparently arbitrary. This DOES 449 # mean that other comments can NOT be inserted between the header and the 450 # binary comment! PDF::Builder always outputs this comment, so is always 451 # claiming binary data (no harm done?). 452 453 return $self; 454} 455 456=head2 $p->close_file() 457 458Closes up the open file for output, by outputting the trailer, etc. 459 460=cut 461 462sub close_file { 463 my $self = shift(); 464 465 my $tdict = PDFDict(); 466 $tdict->{'Info'} = $self->{'Info'} if defined $self->{'Info'}; 467 $tdict->{'Root'} = (defined $self->{' newroot'} and $self->{' newroot'} ne '') ? $self->{' newroot'} : $self->{'Root'}; 468 469 # remove all freed objects from the outlist, AND the outlist_cache if not updating 470 # NO! Don't do that thing! In fact, let out_trailer do the opposite! 471 472 $tdict->{'Size'} = $self->{'Size'} || PDFNum(1); 473 $tdict->{'Prev'} = PDFNum($self->{' loc'}) if $self->{' loc'}; 474 if ($self->{' update'}) { 475 foreach my $key (grep { $_ !~ m/^[\s\-]/ } keys %$self) { 476 $tdict->{$key} = $self->{$key} unless defined $tdict->{$key}; 477 } 478 479 my $fh = $self->{' INFILE'}; 480 $fh->seek($self->{' epos'}, 0); 481 } 482 483 $self->out_trailer($tdict, $self->{' update'}); 484 close($self->{' OUTFILE'}); 485 if ($^O eq 'MacOS' and not ref($self->{' fname'})) { 486 MacPerl::SetFileInfo('CARO', 'TEXT', $self->{' fname'}); 487 } 488 489 return $self; 490} # end of close_file() 491 492=head2 ($value, $str) = $p->readval($str, %opts) 493 494Reads a PDF value from the current position in the file. If C<$str> is too 495short, read some more from the current location in the file until the whole 496object is read. This is a recursive call which may slurp in a whole big stream 497(unprocessed). 498 499Returns the recursive data structure read and also the current C<$str> that has 500been read from the file. 501 502=cut 503 504sub readval { 505 my ($self, $str, %opts) = @_; 506 my $fh = $self->{' INFILE'}; 507 my ($result, $value); 508 509 my $update = defined($opts{'update'}) ? $opts{'update'} : 1; 510 $str = update($fh, $str) if $update; 511 512 $str =~ s/^$ws_char+//; # Ignore initial white space 513 $str =~ s/^\%[^\015\012]*$ws_char+//; # Ignore comments 514 515 if ($str =~ m/^<</s) { 516 # Dictionary 517 $str = substr ($str, 2); 518 $str = update($fh, $str) if $update; 519 $result = PDFDict(); 520 521 while ($str !~ m/^>>/) { 522 $str =~ s/^$ws_char+//; # Ignore initial white space 523 $str =~ s/^\%[^\015\012]*$ws_char+//; # Ignore comments 524 525 if ($str =~ s|^/($reg_char+)||) { 526 my $key = PDF::Builder::Basic::PDF::Name::name_to_string($1, $self); 527 ($value, $str) = $self->readval($str, %opts); 528 # per Vadim Repin (RT 131147) CHG 1. His conclusion is that 529 # it is highly unlikely, but remotely possible, that there 530 # could be legitimate use of Null objects that should NOT be 531 # prevented from bubbling up. If such a case is discovered, we 532 # might have to try Klaus Ethgen's more limited (in scope) 533 # patch in ./Pages.pm. See full discussion in RT 131147 for 534 # details on what's going on and how this fixes it. 535 #$result->{$key} = $value; # original code 536 $result->{$key} = $value 537 unless ref($value) eq 'PDF::Builder::Basic::PDF::Null'; 538 } elsif ($str =~ s|^/$ws_char+||) { 539 # fixes a broken key problem of acrobat. -- fredo 540 ($value, $str) = $self->readval($str, %opts); 541 $result->{'null'} = $value; 542 } elsif ($str =~ s|^//|/|) { 543 # fixes again a broken key problem of illustrator/enfocus. -- fredo 544 ($value, $str) = $self->readval($str, %opts); 545 $result->{'null'} = $value; 546 } else { 547 die "Invalid dictionary key"; 548 } 549 $str = update($fh, $str) if $update; # thanks gareth.jones@stud.man.ac.uk 550 } 551 552 $str =~ s/^>>//; 553 $str = update($fh, $str) if $update; 554 # streams can't be followed by a lone carriage-return. 555 # fredo: yes they can !!! -- use the MacOS, Luke. 556 # TBD isn't this covered by $cr as space CR? 557 if (($str =~ s/^stream(?:(?:\015\012)|\012|\015)//) and ($result->{'Length'}->val() != 0)) { # stream 558 my $length = $result->{'Length'}->val(); 559 $result->{' streamsrc'} = $fh; 560 $result->{' streamloc'} = $fh->tell() - length($str); 561 562 unless ($opts{'nostreams'}) { 563 if ($length > length($str)) { 564 $value = $str; 565 $length -= length($str); 566 read $fh, $str, $length + 11; # slurp the whole stream! 567 } else { 568 $value = ''; 569 } 570 $value .= substr($str, 0, $length); 571 $result->{' stream'} = $value; 572 $result->{' nofilt'} = 1; 573 $str = update($fh, $str, 1) if $update; # tell update we are in-stream and only need an endstream 574 $str = substr($str, index($str, 'endstream') + 9); 575 } 576 } 577 578 if (defined $result->{'Type'} and defined $types{$result->{'Type'}->val()}) { 579 bless $result, $types{$result->{'Type'}->val()}; 580 } 581 # gdj: FIXME: if any of the ws chars were crs, then the whole 582 # string might not have been read. 583 584 } elsif ($str =~ m/^([0-9]+)(?:$ws_char|$re_comment)+([0-9]+)(?:$ws_char|$re_comment)+R/s) { 585 # Indirect Object 586 my $num = $1; 587 $value = $2; 588 $str =~ s/^([0-9]+)(?:$ws_char|$re_comment)+([0-9]+)(?:$ws_char|$re_comment)+R//s; 589 unless ($result = $self->test_obj($num, $value)) { 590 $result = PDF::Builder::Basic::PDF::Objind->new(); 591 $result->{' objnum'} = $num; 592 $result->{' objgen'} = $value; 593 $self->add_obj($result, $num, $value); 594 } 595 $result->{' parent'} = $self; 596 weaken $result->{' parent'}; 597 #$result->{' realised'} = 0; 598 # removed to address changes being lost when an indirect object 599 # is realised twice 600 # gdj: FIXME: if any of the ws chars were crs, then the whole 601 # string might not have been read. 602 603 } elsif ($str =~ m/^([0-9]+)(?:$ws_char|$re_comment)+([0-9]+)(?:$ws_char|$re_comment)+obj/s) { 604 # Object 605 my $obj; 606 my $num = $1; 607 $value = $2; 608 $str =~ s/^([0-9]+)(?:$ws_char|$re_comment)+([0-9]+)(?:$ws_char|$re_comment)+obj//s; 609 ($obj, $str) = $self->readval($str, %opts); 610 if ($result = $self->test_obj($num, $value)) { 611 $result->merge($obj); 612 } else { 613 $result = $obj; 614 $self->add_obj($result, $num, $value); 615 $result->{' realised'} = 1; 616 } 617 $str = update($fh, $str) if $update; # thanks to kundrat@kundrat.sk 618 $str =~ s/^endobj//; 619 620 } elsif ($str =~ m|^/($reg_char*)|s) { 621 # Name 622 $value = $1; 623 $str =~ s|^/($reg_char*)||s; 624 $result = PDF::Builder::Basic::PDF::Name->from_pdf($value, $self); 625 626 } elsif ($str =~ m/^\(/) { 627 # Literal String 628 # We now need to find an unbalanced, unescaped right-paren. 629 # This can't be done with a regex. 630 my $value = '('; 631 $str = substr($str, 1); 632 633 my $nested_level = 1; 634 while (1) { 635 # Ignore everything up to the first escaped or parenthesis character 636 if ($str =~ /^([^\\()]+)(.*)/s) { 637 $value .= $1; 638 $str = $2; 639 } 640 641 # Ignore escaped parentheses 642 if ($str =~ /^(\\[()])/) { 643 $value .= $1; 644 $str = substr($str, 2); 645 646 } elsif ($str =~ /^\(/) { 647 # Left parenthesis: increase nesting 648 $value .= '('; 649 $str = substr($str, 1); 650 $nested_level++; 651 652 } elsif ($str =~ /^\)/) { 653 # Right parenthesis: decrease nesting 654 $value .= ')'; 655 $str = substr($str, 1); 656 $nested_level--; 657 last unless $nested_level; 658 659 } elsif ($str =~ /^(\\[^()])/) { 660 # Other escaped character 661 $value .= $1; 662 $str = substr($str, 2); 663 664 } else { 665 # If there wasn't an escaped or parenthesis character, 666 # read some more. 667 668 # We don't use update because we don't want to remove 669 # whitespace or comments. 670 $fh->read($str, 255, length($str)) or die 'Unterminated string.'; 671 } 672 } # end while(TRUE) loop 673 674 $result = PDF::Builder::Basic::PDF::String->from_pdf($value); 675 # end Literal String check 676 677 } elsif ($str =~ m/^</) { 678 # Hex String 679 $str =~ s/^<//; 680 $fh->read($str, 255, length($str)) while (0 > index($str, '>')); 681 ($value, $str) = ($str =~ /^(.*?)>(.*)/s); 682 $result = PDF::Builder::Basic::PDF::String->from_pdf('<' . $value . '>'); 683 684 } elsif ($str =~ m/^\[/) { 685 # Array 686 $str =~ s/^\[//; 687 $str = update($fh, $str) if $update; 688 $result = PDFArray(); 689 while ($str !~ m/^\]/) { 690 $str =~ s/^$ws_char+//; # Ignore initial white space 691 $str =~ s/^\%[^\015\012]*$ws_char+//; # Ignore comments 692 693 ($value, $str) = $self->readval($str, %opts); 694 $result->add_elements($value); 695 $str = update($fh, $str) if $update; # str might just be exhausted! 696 } 697 $str =~ s/^\]//; 698 699 } elsif ($str =~ m/^(true|false)($irreg_char|$)/) { 700 # Boolean 701 $value = $1; 702 $str =~ s/^(?:true|false)//; 703 $result = PDF::Builder::Basic::PDF::Bool->from_pdf($value); 704 705 } elsif ($str =~ m/^([+-.0-9]+)($irreg_char|$)/) { 706 # Number 707 $value = $1; 708 $str =~ s/^([+-.0-9]+)//; 709 710 # If $str only consists of whitespace (or is empty), call update to 711 # see if this is the beginning of an indirect object or reference 712 if ($update and ($str =~ /^$re_whitespace*$/s or $str =~ /^$re_whitespace+[0-9]+$re_whitespace*$/s)) { 713 $str =~ s/^$re_whitespace+/ /s; 714 $str =~ s/$re_whitespace+$/ /s; 715 $str = update($fh, $str); 716 if ($str =~ m/^$re_whitespace*([0-9]+)$re_whitespace+(?:R|obj)/s) { 717 return $self->readval("$value $str", %opts); 718 } 719 } 720 721 $result = PDF::Builder::Basic::PDF::Number->from_pdf($value); 722 723 } elsif ($str =~ m/^null($irreg_char|$)/) { 724 # Null 725 $str =~ s/^null//; 726 $result = PDF::Builder::Basic::PDF::Null->new(); 727 728 } else { 729 die "Can't parse `$str' near " . ($fh->tell()) . " length " . length($str) . "."; 730 } 731 732 $str =~ s/^$ws_char+//s; 733 return ($result, $str); 734} # end of readval() 735 736=head2 $ref = $p->read_obj($objind, %opts) 737 738Given an indirect object reference, locate it and read the object returning 739the read in object. 740 741=cut 742 743sub read_obj { 744 my ($self, $objind, %opts) = @_; 745 746 my $res = $self->read_objnum($objind->{' objnum'}, $objind->{' objgen'}, %opts) || return; 747 $objind->merge($res) unless $objind eq $res; 748 749 return $objind; 750} 751 752=head2 $ref = $p->read_objnum($num, $gen, %opts) 753 754Returns a fully read object of given number and generation in this file 755 756=cut 757 758sub read_objnum { 759 my ($self, $num, $gen, %opts) = @_; 760 761 croak 'Undefined object number in call to read_objnum($num, $gen)' unless defined $num; 762 croak 'Undefined object generation in call to read_objnum($num, $gen)' unless defined $gen; 763 croak "Invalid object number '$num' in call to read_objnum" unless $num =~ /^[0-9]+$/; 764 croak "Invalid object generation '$gen' in call to read_objnum" unless $gen =~ /^[0-9]+$/; 765 766 my $object_location = $self->locate_obj($num, $gen) || return; 767 my $object; 768 769 # Compressed object 770 if (ref($object_location)) { 771 my ($object_stream_num, $object_stream_pos) = @{$object_location}; 772 773 my $object_stream = $self->read_objnum($object_stream_num, 0, %opts); 774 die 'Cannot find the compressed object stream' unless $object_stream; 775 776 $object_stream->read_stream() if $object_stream->{' nofilt'}; 777 778 # An object stream starts with pairs of integers containing object numbers and 779 # stream offsets relative to the First key 780 my $fh; 781 my $pairs; 782 unless ($object_stream->{' streamfile'}) { 783 $pairs = substr($object_stream->{' stream'}, 0, $object_stream->{'First'}->val()); 784 } else { 785 CORE::open($fh, '<', $object_stream->{' streamfile'}); 786 read($fh, $pairs, $object_stream->{'First'}->val()); 787 } 788 my @map = split /\s+/, $pairs; 789 790 # Find the offset of the object in the stream 791 my $index = $object_stream_pos * 2; 792 die "Objind $num does not exist at index $index" unless $map[$index] == $num; 793 my $start = $map[$index + 1]; 794 795 # Unless this is the last object in the stream, its length is 796 # determined by the offset of the next object. 797 my $last_object_in_stream = $map[-2]; 798 my $length; 799 if ($last_object_in_stream == $num) { 800 if ($object_stream->{' stream'}) { 801 $length = length($object_stream->{' stream'}) - $object_stream->{'First'}->val() - $start; 802 } else { 803 $length = (-s $object_stream->{' streamfile'}) - $object_stream->{'First'}->val() - $start; 804 } 805 } else { 806 my $next_start = $map[$index + 3]; 807 $length = $next_start - $start; 808 } 809 810 # Read the object from the stream 811 my $stream = "$num 0 obj "; 812 unless ($object_stream->{' streamfile'}) { 813 $stream .= substr($object_stream->{' stream'}, $object_stream->{'First'}->val() + $start, $length); 814 } else { 815 seek($fh, $object_stream->{'First'}->val() + $start, 0); 816 read($fh, $stream, $length, length($stream)); 817 close $fh; 818 } 819 820 ($object) = $self->readval($stream, %opts, update => 0); 821 return $object; 822 } 823 824 my $current_location = $self->{' INFILE'}->tell(); 825 $self->{' INFILE'}->seek($object_location, 0); 826 ($object) = $self->readval('', %opts); 827 $self->{' INFILE'}->seek($current_location, 0); 828 829 return $object; 830} # end of read_objnum() 831 832=head2 $objind = $p->new_obj($obj) 833 834Creates a new, free object reference based on free space in the cross reference 835chain. If nothing is free, then think up a new number. If C<$obj>, then turns 836that object into this new object rather than returning a new object. 837 838=cut 839 840sub new_obj { 841 my ($self, $base) = @_; 842 my $res; 843 844 if (defined $self->{' free'} and scalar @{$self->{' free'}} > 0) { 845 $res = shift(@{$self->{' free'}}); 846 if (defined $base) { 847 my ($num, $gen) = @{$self->{' objects'}{$res->uid()}}; 848 $self->remove_obj($res); 849 $self->add_obj($base, $num, $gen); 850 return $self->out_obj($base); 851 } else { 852 $self->{' objects'}{$res->uid()}[2] = 0; 853 return $res; 854 } 855 } 856 857 my $tdict = $self; 858 my $i; 859 while (defined $tdict) { 860 $i = $tdict->{' xref'}{defined($i) ? $i : ''}[0]; 861 while (defined $i and $i != 0) { 862 my ($ni, $ng) = @{$tdict->{' xref'}{$i}}; 863 unless (defined $self->locate_obj($i, $ng)) { 864 if (defined $base) { 865 $self->add_obj($base, $i, $ng); 866 return $base; 867 } else { 868 $res = $self->test_obj($i, $ng) || $self->add_obj(PDF::Builder::Basic::PDF::Objind->new(), $i, $ng); 869 $self->out_obj($res); 870 return $res; 871 } 872 } 873 $i = $ni; 874 } 875 $tdict = $tdict->{' prev'}; 876 } 877 878 $i = $self->{' maxobj'}++; 879 if (defined $base) { 880 $self->add_obj($base, $i, 0); 881 $self->out_obj($base); 882 return $base; 883 } else { 884 $res = $self->add_obj(PDF::Builder::Basic::PDF::Objind->new(), $i, 0); 885 $self->out_obj($res); 886 return $res; 887 } 888} 889 890=head2 $p->out_obj($obj) 891 892Indicates that the given object reference should appear in the output xref 893table whether with data or freed. 894 895=cut 896 897sub out_obj { 898 my ($self, $obj) = @_; 899 900 # This is why we've been keeping the outlist CACHE around; to speed 901 # up this method by orders of magnitude (it saves up from having to 902 # grep the full outlist each time through as we'll just do a lookup 903 # in the hash) (which is super-fast). 904 unless (exists $self->{' outlist_cache'}{$obj}) { 905 push @{$self->{' outlist'}}, $obj; 906 # weaken $self->{' outlist'}->[-1]; 907 $self->{' outlist_cache'}{$obj} = 1; 908 } 909 910 return $obj; 911} 912 913=head2 $p->free_obj($obj) 914 915Marks an object reference for output as being freed. 916 917=cut 918 919sub free_obj { 920 my ($self, $obj) = @_; 921 922 push @{$self->{' free'}}, $obj; 923 $self->{' objects'}{$obj->uid()}[2] = 1; 924 $self->out_obj($obj); 925 926 return; 927} 928 929=head2 $p->remove_obj($objind) 930 931Removes the object from all places where we might remember it. 932 933=cut 934 935sub remove_obj { 936 my ($self, $objind) = @_; 937 938 # who says it has to be fast 939 delete $self->{' objects'}{$objind->uid()}; 940 delete $self->{' outlist_cache'}{$objind}; 941 delete $self->{' printed_cache'}{$objind}; 942 @{$self->{' outlist'}} = grep { $_ ne $objind } @{$self->{' outlist'}}; 943 @{$self->{' printed'}} = grep { $_ ne $objind } @{$self->{' printed'}}; 944 $self->{' objcache'}{$objind->{' objnum'}, $objind->{' objgen'}} = undef 945 if $self->{' objcache'}{$objind->{' objnum'}, $objind->{' objgen'}} eq $objind; 946 947 return $self; 948} 949 950=head2 $p->ship_out(@objects) 951 952=head2 $p->ship_out() 953 954Ships the given objects (or all objects for output if C<@objects> is empty) to 955the currently open output file (assuming there is one). Freed objects are not 956shipped, and once an object is shipped it is switched such that this file 957becomes its source and it will not be shipped again unless out_obj is called 958again. Notice that a shipped out object can be re-output or even freed, but 959that it will not cause the data already output to be changed. 960 961=cut 962 963sub ship_out { 964 my ($self, @objects) = @_; 965 966 return unless defined $self->{' OUTFILE'}; 967 my $fh = $self->{' OUTFILE'}; 968 seek($fh, 0, 2); # go to the end of the file 969 970 @objects = @{$self->{' outlist'}} unless scalar @objects > 0; 971 foreach my $objind (@objects) { 972 next unless $objind->is_obj($self); 973 my $j = -1; 974 for (my $i = 0; $i < scalar @{$self->{' outlist'}}; $i++) { 975 if ($self->{' outlist'}[$i] eq $objind) { 976 $j = $i; 977 last; 978 } 979 } 980 next if $j < 0; 981 splice(@{$self->{' outlist'}}, $j, 1); 982 delete $self->{' outlist_cache'}{$objind}; 983 next if grep { $_ eq $objind } @{$self->{' free'}}; 984 985 map { $fh->print("\% $_ \n") } split(/$cr/, $objind->{' comments'}) if $objind->{' comments'}; 986 $self->{' locs'}{$objind->uid()} = $fh->tell(); 987 my ($objnum, $objgen) = @{$self->{' objects'}{$objind->uid()}}[0..1]; 988 $fh->printf('%d %d obj ', $objnum, $objgen); 989 $objind->outobjdeep($fh, $self); 990 $fh->print("\nendobj\n"); 991 992 # Note that we've output this obj, not forgetting to update 993 # the cache of what's printed. 994 unless (exists $self->{' printed_cache'}{$objind}) { 995 push @{$self->{' printed'}}, $objind; 996 $self->{' printed_cache'}{$objind}++; 997 } 998 } 999 1000 return $self; 1001} # end of ship_out() 1002 1003=head2 $p->copy($outpdf, \&filter) 1004 1005Iterates over every object in the file reading the object, calling C<filter> 1006with the object, and outputting the result. If C<filter> is not defined, 1007just copies input to output. 1008 1009=cut 1010 1011sub copy { 1012 my ($self, $outpdf, $filter) = @_; 1013 my ($obj, $minl, $mini, $ming); 1014 1015 foreach my $key (grep { not m/^[\s\-]/ } keys %$self) { 1016 $outpdf->{$key} = $self->{$key} unless defined $outpdf->{$key}; 1017 } 1018 1019 my $tdict = $self; 1020 while (defined $tdict) { 1021 foreach my $i (sort {$a <=> $b} keys %{$tdict->{' xref'}}) { 1022 my ($nl, $ng, $nt) = @{$tdict->{' xref'}{$i}}; 1023 next unless $nt eq 'n'; 1024 1025 if ($nl < $minl or $mini == 0) { 1026 $mini = $i; 1027 $ming = $ng; 1028 $minl = $nl; 1029 } 1030 unless ($obj = $self->test_obj($i, $ng)) { 1031 $obj = PDF::Builder::Basic::PDF::Objind->new(); 1032 $obj->{' objnum'} = $i; 1033 $obj->{' objgen'} = $ng; 1034 $self->add_obj($obj, $i, $ng); 1035 $obj->{' parent'} = $self; 1036 weaken $obj->{' parent'}; 1037 $obj->{' realised'} = 0; 1038 } 1039 $obj->realise(); 1040 my $res = defined $filter ? &{$filter}($obj) : $obj; 1041 $outpdf->new_obj($res) unless (!$res || $res->is_obj($outpdf)); 1042 } 1043 $tdict = $tdict->{' prev'}; 1044 } 1045 1046 # test for linearized and remove it from output 1047 $obj = $self->test_obj($mini, $ming); 1048 if ($obj->isa('PDF::Builder::Basic::PDF::Dict') && $obj->{'Linearized'}) { 1049 $outpdf->free_obj($obj); 1050 } 1051 1052 return $self; 1053} # end of copy() 1054 1055=head1 PRIVATE METHODS & FUNCTIONS 1056 1057The following methods and functions are considered B<private> to this class. 1058This does not mean you cannot use them if you have a need, just that they 1059aren't really designed for users of this class. 1060 1061=head2 $offset = $p->locate_obj($num, $gen) 1062 1063Returns a file offset to the object asked for by following the chain of cross 1064reference tables until it finds the one you want. 1065 1066=cut 1067 1068sub locate_obj { 1069 my ($self, $num, $gen) = @_; 1070 1071 my $tdict = $self; 1072 while (defined $tdict) { 1073 if (ref $tdict->{' xref'}{$num}) { 1074 my $ref = $tdict->{' xref'}{$num}; 1075 return $ref unless scalar(@$ref) == 3; 1076 1077 if ($ref->[1] == $gen) { 1078 return $ref->[0] if $ref->[2] eq 'n'; 1079 return; # if $ref->[2] eq 'f'; 1080 } 1081 } 1082 $tdict = $tdict->{' prev'}; 1083 } 1084 1085 return; 1086} 1087 1088=head2 update($fh, $str, $instream) 1089 1090Keeps reading C<$fh> for more data to ensure that C<$str> has at least a line 1091full for C<readval> to work on. At this point we also take the opportunity to 1092ignore comments. 1093 1094=cut 1095 1096sub update { 1097 my ($fh, $str, $instream) = @_; 1098 1099 print STDERR 'fpos=' . tell($fh) . ' strlen=' . length($str) . "\n" if $readDebug; 1100 if ($instream) { 1101 # we are inside a (possible binary) stream 1102 # so we fetch data till we see an 'endstream' 1103 # -- fredo/2004-09-03 1104 while ($str !~ m/endstream/ and not $fh->eof()) { 1105 print STDERR 'fpos=' . tell($fh) . ' strlen=' . length($str) . "\n" if $readDebug; 1106 $fh->read($str, 314, length($str)); 1107 } 1108 } else { 1109 $str =~ s/^$ws_char*//; 1110 while ($str !~ m/$cr/ and not $fh->eof()) { 1111 print STDERR 'fpos=' . tell($fh) . ' strlen=' . length($str) . "\n" if $readDebug; 1112 $fh->read($str, 314, length($str)); 1113 $str =~ s/^$ws_char*//so; 1114 } 1115 while ($str =~ m/^\%/) { # restructured by fredo/2003-03-23 1116 print STDERR 'fpos=' . tell($fh) . ' strlen=' . length($str) . "\n" if $readDebug; 1117 $fh->read($str, 314, length($str)) while ($str !~ m/$cr/ and not $fh->eof()); 1118 $str =~ s/^\%[^\015\012]*$ws_char*//so; # fixed for reportlab -- fredo 1119 } 1120 } 1121 1122 return $str; 1123} # end of update() 1124 1125=head2 $objind = $p->test_obj($num, $gen) 1126 1127Tests the cache to see whether an object reference (which may or may not have 1128been getobj()ed) has been cached. Returns it if it has. 1129 1130=cut 1131 1132sub test_obj { 1133 my ($self, $num, $gen) = @_; 1134 1135 return $self->{' objcache'}{$num, $gen}; 1136} 1137 1138=head2 $p->add_obj($objind) 1139 1140Adds the given object to the internal object cache. 1141 1142=cut 1143 1144sub add_obj { 1145 my ($self, $obj, $num, $gen) = @_; 1146 1147 $self->{' objcache'}{$num, $gen} = $obj; 1148 $self->{' objects'}{$obj->uid()} = [$num, $gen]; 1149 # weaken $self->{' objcache'}{$num, $gen}; 1150 return $obj; 1151} 1152 1153=head2 $tdict = $p->readxrtr($xpos, %options) 1154 1155Recursive function which reads each of the cross-reference and trailer tables 1156in turn until there are no more. 1157 1158Returns a dictionary corresponding to the trailer chain. Each trailer also 1159includes the corresponding cross-reference table. 1160 1161The structure of the xref private element in a trailer dictionary is of an 1162anonymous hash of cross reference elements by object number. Each element 1163consists of an array of 3 elements corresponding to the three elements read 1164in [location, generation number, free or used]. See the PDF specification 1165for details. 1166 1167See C<open> for options allowed. 1168 1169=cut 1170 1171sub _unpack_xref_stream { 1172 my ($self, $width, $data) = @_; 1173 1174 # handle some oddball cases 1175 if ($width == 3) { 1176 $data = "\x00$data"; 1177 $width = 4; 1178 } elsif ($width == 5) { 1179 $data = "\x00\x00\x00$data"; 1180 $width = 8; 1181 } elsif ($width == 6) { 1182 $data = "\x00\x00$data"; 1183 $width = 8; 1184 } elsif ($width == 7) { 1185 $data = "\x00$data"; 1186 $width = 8; 1187 } 1188 # in all cases, "Network" (Big-Endian) byte order assumed 1189 return unpack('C', $data) if $width == 1; 1190 return unpack('n', $data) if $width == 2; 1191 return unpack('N', $data) if $width == 4; 1192 if ($width == 8) { 1193 # Some ways other packages handle this, without Perl-64, according 1194 # to Vadim Repin. Possibly they end up converting the value to 1195 # "double" behind the scenes if on a 32-bit platform? 1196 # PDF::Tiny return hex unpack('H16', $data); 1197 # CAM::PDF my @b = unpack('C*', $data); 1198 # my $i=0; ($i <<= 8) += shift @b while @b; return $i; 1199 1200 if (substr($data, 0, 4) eq "\x00\x00\x00\x00") { 1201 # can treat as 32 bit unsigned int 1202 return unpack('N', substr($data, 4, 4)); 1203 } else { 1204 # requires 64-bit platform (chip and Perl), else fatal error 1205 # it may blow up and produce a smoking crater if 32-bit Perl! 1206 # also note that Q needs Big-Endian flag (>) specified, else 1207 # it will use the native chip order (Big- or Little- Endian) 1208 return unpack('Q>', $data); 1209 } 1210 } 1211 1212 die "Unsupported field width: $width. 1-8 supported."; 1213} 1214 1215sub readxrtr { 1216 my ($self, $xpos, %options) = @_; 1217 # $xpos SHOULD be pointing to "xref" keyword 1218 my ($tdict, $buf, $xmin, $xnum, $xdiff); 1219 1220 my $fh = $self->{' INFILE'}; 1221 $fh->seek($xpos, 0); 1222 $fh->read($buf, 22); # 22 should overlap into first subsection 1223 $buf = update($fh, $buf); # fix for broken JAWS xref calculation. 1224 1225 my $xlist = {}; 1226 1227 ## it seems that some products calculate wrong prev entries (short) 1228 ## so we seek ahead to find one -- fredo; save for now 1229 #while ($buf !~ m/^xref$cr/i && !eof($fh)) { 1230 # $buf =~ s/^(\s+|\S+|.)//i; 1231 # $buf = update($fh, $buf); 1232 #} 1233 1234 if ($buf =~ s/^xref$cr//i) { # remove xrefEOL from buffer 1235 # Plain XRef tables. 1236 # 1237 # look to match startobj# count# EOL of first (or only) subsection 1238 # supposed to be single ASCII space between numbers, but this is 1239 # more lenient for some writers, allowing 1 or more whitespace 1240 my $subsection_count = 0; 1241 my $entry_format_error = 0; 1242 my $xrefListEmpty = 0; 1243 1244 while ($buf =~ m/^$ws_char*([0-9]+)$ws_char+([0-9]+)$ws_char*$cr(.*?)$/s) { 1245 my $old_buf = $buf; 1246 $xmin = $1; # starting object number of this subsection 1247 $xnum = $2; # number of entries in this subsection 1248 $buf = $3; # remainder of buffer 1249 $subsection_count++; 1250 # go back and warn if other than single space separating numbers 1251 unless ($old_buf =~ /^[0-9]+ [0-9]+$cr/) { #orig 'warn' 1252 if ($options{'-diags'} == 1) { 1253 # See PDF 1.7 section 7.5.4: Cross-Reference Table 1254 warn "Malformed xref: subsection header needs a single\n" . 1255 "ASCII space between the numbers and no extra spaces.\n"; 1256 } 1257 } 1258 $xdiff = length($buf); # how much remaining in buffer 1259 1260 # in case xnum == 0 is permitted (or used and tolerated by readers), 1261 # skip over entry reads and go to next subsection 1262 if ($xnum < 1) { 1263 if ($options{'-diags'} == 1) { 1264 warn "Xref subsection has 0 entries. Skipped.\n"; 1265 } 1266 $xrefListEmpty = 1; 1267 next; 1268 } 1269 1270 # read chunk of entire subsection list 1271 my $entry_size = 20; 1272 # test read first entry, see if $cr in expected place, adjust size 1273 $fh->read($buf, $entry_size * 1 - $xdiff + 15, $xdiff); 1274 if ($buf =~ m/^(.*?)$cr/) { 1275 $entry_size = length($1) + 2; 1276 } 1277 if ($entry_size != 20 && $options{'-diags'} == 1) { 1278 warn "Xref entries supposed to be 20 bytes long, are $entry_size.\n"; 1279 } 1280 $xdiff = length($buf); 1281 1282 # read remaining entries 1283 $fh->read($buf, $entry_size * $xnum - $xdiff + 15, $xdiff); 1284 # each entry is two integers and flag. spec says single ASCII space 1285 # between each field and certain length for each (10, 5, 1), so 1286 # this appears to be more lenient than spec 1287 # is object 0 supposed to be in subsection 1, or is any place OK? 1288 while ($xnum-- > 0 and 1289 $buf =~ m/^$ws_char*(\d+)$ws_char+(\d+)$ws_char+([nf])$ws_char*$cr/) { 1290 # check if format doesn't match spec 1291 if ($buf =~ m/^\d{10} \d{5} [nf]$cr/ || 1292 $entry_format_error) { 1293 # format OK or have already reported format problem 1294 } else { 1295 if ($options{'-diags'} == 1) { 1296 warn "Xref entry readable, but doesn't meet PDF spec.\n"; 1297 } 1298 $entry_format_error++; 1299 } 1300 1301 $buf =~ s/^$ws_char*(\d+)$ws_char+(\d+)$ws_char+([nf])$ws_char*$cr//; 1302 # $1 = object's starting offset in file (n) or 1303 # next object in free list (f) [0 if last] 1304 # $2 = generation number (n) or 65535 for object 0 (f) or 1305 # next generation number (f) 1306 # $3 = flag (n = object in use, f = free) 1307 # buf reduced by entry just processed 1308 if (exists $xlist->{$xmin}) { 1309 if ($options{'-diags'} == 1) { 1310 warn "Duplicate object number $xmin in xref table ignored.\n"; 1311 } 1312 } else { 1313 $xlist->{$xmin} = [$1, $2, $3]; 1314 if ($xmin == 0 && $subsection_count > 1 && $options{'-diags'} == 1) { 1315 warn "Xref object 0 entry not in first subsection.\n"; 1316 } 1317 } 1318 $xmin++; 1319 } # traverse one subsection for objects xmin through xmin+xnum-1 1320 # go back for next subsection (if any) 1321 } # loop through xref subsections 1322 # fall through to here when run out of xref subsections 1323 # xlist should have two or more object entries, may not be contiguous 1324 1325 # should have an object 0 1326 # at this point, no idea if object 0 was in first subsection (legal?) 1327 # could attempt a fixup if no object 0 found. many fixups are quite 1328 # risky and could end up corrupting the free list. 1329 # there's no guarantee that a proper free list will result, but any 1330 # error should hopefully be caught further on 1331 if (!exists $xlist->{'0'} && !$xrefListEmpty) { 1332 # for now, 1 subsection starting with 1, and only object 1 in 1333 # free list, try to fix up 1334 if ($subsection_count == 1 && exists $xlist->{'1'}) { 1335 # apparently a common enough error in PDF writers 1336 1337 if ($xlist->{'1'}[0] == 0 && # only member of free list 1338 $xlist->{'1'}[1] == 65535 && 1339 $xlist->{'1'}[2] eq 'f') { 1340 # object 1 appears to be the free list head, so shift 1341 # down all objects 1342 if ($options{'-diags'} == 1) { 1343 warn "xref appears to be mislabeled starting with 1. Shift down all elements.\n"; 1344 } 1345 my $next = 1; 1346 while (exists $xlist->{$next}) { 1347 $xlist->{$next - 1} = $xlist->{$next}; 1348 $next++; 1349 } 1350 delete $xlist->{--$next}; 1351 1352 } else { 1353 # if object 1 does not appear to be a free list head, 1354 # insert a new object 0 1355 if ($options{'-diags'} == 1) { 1356 warn "Xref appears to be missing object 0. Insert a new one.\n"; 1357 } 1358 $xlist->{'0'} = [0, 65535, 'f']; 1359 } 1360 } else { 1361 if ($options{'-diags'} == 1) { 1362 warn "Malformed cross reference list in PDF file $self->{' fname'} -- no object 0 (free list head)\n"; 1363 } 1364 $xlist->{'0'} = [0, 65535, 'f']; 1365 } 1366 } # no object 0 entry 1367 1368 # build/validate the free list (and no active objects have f flag) 1369 my @free_list; 1370 foreach (sort {$a <=> $b} keys %{ $xlist }) { 1371 # if 'f' flag, is in free list 1372 if ($xlist->{$_}[2] eq 'f') { 1373 if ($xlist->{$_}[1] <= 0 && $options{'-diags'} == 1) { 1374 warn "Xref free list entry $_ with bad next generation number.\n"; 1375 } else { 1376 push @free_list, $_; # should be in numeric order (0 first) 1377 } 1378 } elsif ($xlist->{$_}[2] eq 'n') { 1379 if ($xlist->{$_}[0] <= 0 && $options{'-diags'} == 1) { 1380 warn "Xref active object $_ entry with bad length ".($xlist->{$_}[1])."\n"; 1381 } 1382 if ($xlist->{$_}[1] < 0 && $options{'-diags'} == 1) { 1383 warn "Xref active object $_ entry with bad generation number ".($xlist->{$_}[1])."\n"; 1384 } 1385 } else { 1386 if ($options{'-diags'} == 1) { 1387 warn "Xref entry has flag that is not 'f' or 'n'.\n"; 1388 } 1389 } 1390 } # go through xlist and build free_list and check entries 1391 # traverse free list and check that "next object" is also in free list 1392 my $next_free = 0; # object 0 should always be in free list 1393 if ($xlist->{'0'}[1] != 65535 && $options{'-diags'} == 1) { 1394 warn "Object 0 next generation is not 65535.\n"; 1395 } 1396 do { 1397 if ($xlist->{$next_free}[2] ne 'f') { 1398 if ($options{'-diags'} == 1) { 1399 warn "Corrupted free object list: next=$next_free is not a free object.\n"; 1400 } 1401 $next_free = 0; # force end of free list 1402 } else { 1403 $next_free = $xlist->{$next_free}[0]; 1404 } 1405 # remove this entry from free list array 1406 splice(@free_list, index(@free_list, $next_free), 1); 1407 } while ($next_free && exists $xlist->{$next_free}); 1408 if (scalar @free_list && $options{'-diags'} == 1) { 1409 warn "Corrupted xref list: object(s) @free_list marked as free, but are not in free chain.\n"; 1410 } 1411 1412 # done with cross reference table, so go on to trailer 1413 if ($buf !~ /^\s*trailer\b/i && $options{'-diags'} == 1) { #orig 'die' 1414 warn "Malformed trailer in PDF file $self->{' fname'} at " . ($fh->tell() - length($buf)); 1415 } 1416 1417 $buf =~ s/^\s*trailer\b//i; 1418 1419 ($tdict, $buf) = $self->readval($buf); 1420 1421 } elsif ($buf =~ m/^(\d+)\s+(\d+)\s+obj/i) { 1422 my ($xref_obj, $xref_gen) = ($1, $2); 1423 1424 PDF::Builder->verCheckOutput(1.5, "importing cross-reference stream"); 1425 # XRef streams 1426 ($tdict, $buf) = $self->readval($buf); 1427 1428 unless ($tdict->{' stream'}) { 1429 if ($options{'-diags'} == 1) { 1430 warn "Malformed XRefStm at $xref_obj $xref_gen obj in PDF file $self->{' fname'}"; 1431 } 1432 } 1433 $tdict->read_stream(1); 1434 1435 my $stream = $tdict->{' stream'}; 1436 my @widths = map { $_->val() } @{$tdict->{'W'}->val()}; 1437 1438 my $start = 0; 1439 my $last; 1440 1441 my @index; 1442 if (defined $tdict->{'Index'}) { 1443 @index = map { $_->val() } @{$tdict->{'Index'}->val()}; 1444 } else { 1445 @index = (0, $tdict->{'Size'}->val()); 1446 } 1447 1448 while (scalar @index) { 1449 $start = shift(@index); 1450 $last = $start + shift(@index) - 1; 1451 1452 for my $i ($start...$last) { 1453 # Replaced "for $xmin" because it creates a loop-specific local 1454 # variable, and we need $xmin to be correct for maxobj below. 1455 $xmin = $i; 1456 1457 my @cols; 1458 1459 for my $w (@widths) { 1460 my $data; 1461 $data = $self->_unpack_xref_stream($w, substr($stream, 0, $w, '')) if $w; 1462 1463 push @cols, $data; 1464 } 1465 1466 $cols[0] = 1 unless defined $cols[0]; 1467 if ($cols[0] > 2 && $options{'-diags'} == 1) { 1468 warn "Invalid XRefStm entry type ($cols[0]) at $xref_obj $xref_gen obj"; 1469 } 1470 1471 next if exists $xlist->{$xmin}; 1472 1473 my @objind = ($cols[1], defined($cols[2]) ? $cols[2] : ($xmin ? 0 : 65535)); 1474 push @objind, ($cols[0] == 0? 'f': 'n') if $cols[0] < 2; 1475 1476 $xlist->{$xmin} = \@objind; 1477 } 1478 } 1479 1480 } else { #orig 'die' 1481 if ($options{'-diags'} == 1) { 1482 warn "Malformed xref in PDF file $self->{' fname'}"; 1483 } 1484 } 1485 1486 # did we get to here without managing to set $xmin? 1487 $xmin ||= 0; 1488 1489 $tdict->{' loc'} = $xpos; 1490 $tdict->{' xref'} = $xlist; 1491 $self->{' maxobj'} = $xmin + 1 if $xmin + 1 > $self->{' maxobj'}; 1492 $tdict->{' prev'} = $self->readxrtr($tdict->{'Prev'}->val(), %options) 1493 if (defined $tdict->{'Prev'} and $tdict->{'Prev'}->val() != 0); 1494 delete $tdict->{' prev'} unless defined $tdict->{' prev'}; 1495 1496 return $tdict; 1497} # end of readxrtr() 1498 1499=head2 $p->out_trailer($tdict, $update) 1500 1501=head2 $p->out_trailer($tdict) 1502 1503Outputs the body and trailer for a PDF file by outputting all the objects in 1504the ' outlist' and then outputting a xref table for those objects and any 1505freed ones. It then outputs the trailing dictionary and the trailer code. 1506 1507=cut 1508 1509sub out_trailer { 1510 my ($self, $tdict, $update) = @_; 1511 1512 my $fh = $self->{' OUTFILE'}; 1513 1514 while (@{$self->{' outlist'}}) { 1515 $self->ship_out(); 1516 } 1517 1518 # $size = @{$self->{' printed'}} + @{$self->{' free'}}; 1519 # $tdict->{'Size'} = PDFNum($tdict->{'Size'}->val() + $size); 1520 # PDFSpec 1.3 says for /Size: (Required) Total number of entries in the file's 1521 # cross-reference table, including the original table and all updates. Which 1522 # is what the previous two lines implement. 1523 # But this seems to make Acrobat croak on saving so we try the following from 1524 # basil.duval@epfl.ch 1525 $tdict->{'Size'} = PDFNum($self->{' maxobj'}); 1526 1527 my $tloc = $fh->tell(); 1528## $fh->print("xref\n"); 1529 # instead of directly outputting (fh->print) xreflist, we accumulate in @out 1530 my @out; 1531 my @xreflist = sort { $self->{' objects'}{$a->uid()}[0] <=> $self->{' objects'}{$b->uid()}[0] } (@{$self->{' printed'} || []}, @{$self->{' free'} || []}); 1532 1533 my ($i, $j, $k); 1534 unless ($update) { 1535 $i = 1; 1536 for ($j = 0; $j < @xreflist; $j++) { 1537 my @inserts; 1538 $k = $xreflist[$j]; 1539 while ($i < $self->{' objects'}{$k->uid()}[0]) { 1540 my ($n) = PDF::Builder::Basic::PDF::Objind->new(); 1541 $self->add_obj($n, $i, 0); 1542 $self->free_obj($n); 1543 push(@inserts, $n); 1544 $i++; 1545 } 1546 splice(@xreflist, $j, 0, @inserts); 1547 $j += @inserts; 1548 $i++; 1549 } 1550 } 1551 1552 my @freelist = sort { $self->{' objects'}{$a->uid()}[0] <=> $self->{' objects'}{$b->uid()}[0] } @{$self->{' free'} || []}; 1553 1554 $j = 0; my $first = -1; $k = 0; 1555 for ($i = 0; $i <= $#xreflist + 1; $i++) { 1556 # if ($i == 0) { 1557 # $first = $i; $j = $xreflist[0]->{' objnum'}; 1558 # $fh->printf("0 1\n%010d 65535 f \n", $ff); 1559 # } 1560 if ($i > $#xreflist || $self->{' objects'}{$xreflist[$i]->uid()}[0] != $j + 1) { 1561## $fh->print(($first == -1 ? "0 " : "$self->{' objects'}{$xreflist[$first]->uid()}[0] ") . ($i - $first) . "\n"); 1562 push @out, ($first == -1 ? "0 " : "$self->{' objects'}{$xreflist[$first]->uid()}[0] ") . ($i - $first) . "\n"; 1563 if ($first == -1) { 1564## $fh->printf("%010d 65535 f \n", defined $freelist[$k] ? $self->{' objects'}{$freelist[$k]->uid()}[0] : 0); 1565 push @out, sprintf("%010d 65535 f \n", defined $freelist[$k] ? $self->{' objects'}{$freelist[$k]->uid()}[0] : 0); 1566 $first = 0; 1567 } 1568 for ($j = $first; $j < $i; $j++) { 1569 my $xref = $xreflist[$j]; 1570 if (defined $freelist[$k] && defined $xref && "$freelist[$k]" eq "$xref") { 1571 $k++; 1572## $fh->print(pack("A10AA5A4", 1573 push(@out, pack("A10AA5A4", 1574 sprintf("%010d", (defined $freelist[$k] ? 1575 $self->{' objects'}{$freelist[$k]->uid()}[0] : 0)), " ", 1576 sprintf("%05d", $self->{' objects'}{$xref->uid()}[1] + 1), 1577 " f \n")); 1578 } else { 1579## $fh->print(pack("A10AA5A4", sprintf("%010d", $self->{' locs'}{$xref->uid()}), " ", 1580 push(@out, pack("A10AA5A4", sprintf("%010d", $self->{' locs'}{$xref->uid()}), " ", 1581 sprintf("%05d", $self->{' objects'}{$xref->uid()}[1]), 1582 " n \n")); 1583 } 1584 } 1585 $first = $i; 1586 $j = $self->{' objects'}{$xreflist[$i]->uid()}[0] if ($i < scalar @xreflist); 1587 1588 } else { 1589 $j++; 1590 } 1591 } # end for loop through xreflists 1592## $fh->print("trailer\n"); 1593## $tdict->outobjdeep($fh, $self); 1594## $fh->print("\nstartxref\n$tloc\n%%EOF\n"); 1595## start new code for 117184 fix by Vadim. @out has array of xref content 1596 if (exists $tdict->{'Type'} and $tdict->{'Type'}->val() eq 'XRef') { 1597 1598 my (@index, @stream); 1599 for (@out) { # @out is the accumulated cross reference list 1600 my @a = split; 1601 @a == 2 ? push @index, @a : push @stream, \@a; 1602 } 1603 my $i = $self->{' maxobj'}++; 1604 $self->add_obj($tdict, $i, 0); 1605 $self->out_obj($tdict); 1606 1607 push @index, $i, 1; 1608 push @stream, [ $tloc, 0, 'n' ]; 1609 1610 my $len = $tloc > 0xFFFF ? 4 : 2; # don't expect files > 4 Gb 1611 my $tpl = $tloc > 0xFFFF ? 'CNC' : 'CnC'; # don't expect gennum > 255, it's absurd. 1612 # Adobe doesn't use them anymore anyway 1613 my $sstream = ''; 1614 my @prev = ( 0 ) x ( $len + 2 ); # init prev to all 0's 1615 for (@stream) { 1616 # OK to zero out gennum of 65535 for a cross reference stream, 1617 # rather than just truncating to 255 -- Vadim 1618 $_->[ 1 ] = 0 if $_->[ 1 ] == 65535 and 1619 $_->[ 2 ] eq 'f'; 1620 # make sure is 0..255, since will pack with 'C' code -- Phil 1621 if ($_->[1] > 0xFF) { 1622 print "generation number ".($_->[1])." in entry '$_->[0] $_->[1] $_->[2]' exceeds 256, reduced to ".($_->[1] & 0x00FF)."\n"; 1623 } 1624 $_->[ 1 ] &= 0x00FF; 1625 my @line = unpack 'C*', pack $tpl, $_->[ 2 ] eq 'n'? 1 : 0, @{ $_ }[ 0 .. 1 ]; 1626 1627 $sstream .= pack 'C*', 2, # prepend filtering method, "PNG Up" 1628 map {($line[ $_ ] - $prev[ $_ ] + 256) % 256} 0 .. $#line; 1629 @prev = @line; 1630 } 1631 # build a dictionary for the cross reference stream 1632 $tdict->{'Size'} = PDFNum($i + 1); 1633 $tdict->{'Index'} = PDFArray(map { PDFNum($_) } @index); 1634 $tdict->{'W'} = PDFArray(map { PDFNum($_) } 1, $len, 1); 1635 $tdict->{'Filter'} = PDFName('FlateDecode'); 1636 1637 # it's compressed 1638 $tdict->{'DecodeParms'} = PDFDict(); 1639 $tdict->{'DecodeParms'}->val()->{'Predictor'} = PDFNum(12); 1640 $tdict->{'DecodeParms'}->val()->{'Columns'} = PDFNum($len + 2); 1641 1642 $sstream = PDF::Builder::Basic::PDF::Filter::FlateDecode->new()->outfilt($sstream, 1); 1643 $tdict->{' stream'} = $sstream; 1644 $tdict->{' nofilt'} = 1; 1645 delete $tdict->{'Length'}; 1646 $self->ship_out(); 1647 } else { 1648 # delete may be moved later by Vadim closer to where XRefStm created 1649 delete $tdict->{'XRefStm'}; 1650 # almost the original code 1651 $fh->print("xref\n", @out, "trailer\n"); 1652 $tdict->outobjdeep($fh, $self); 1653 $fh->print("\n"); 1654 } 1655 $fh->print("startxref\n$tloc\n%%EOF\n"); 1656## end of new code 1657 1658 return; 1659} # end of out_trailer() 1660 1661=head2 PDF::Builder::Basic::PDF::File->_new() 1662 1663Creates a very empty PDF file object (used by new() and open()) 1664 1665=cut 1666 1667sub _new { 1668 my $class = shift(); 1669 my $self = {}; 1670 1671 bless $self, $class; 1672 $self->{' outlist'} = []; 1673 $self->{' outlist_cache'} = {}; # A cache of what's in the 'outlist' 1674 $self->{' maxobj'} = 1; 1675 $self->{' objcache'} = {}; 1676 $self->{' objects'} = {}; 1677 1678 return $self; 1679} 1680 16811; 1682 1683=head1 AUTHOR 1684 1685Martin Hosken Martin_Hosken@sil.org 1686 1687Copyright Martin Hosken 1999 and onwards 1688 1689No warranty or expression of effectiveness, least of all regarding anyone's 1690safety, is implied in this software or documentation. 1691